From bc3c2fba5b45b882cbb619f4275f0c92c6ff6e68 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 30 Mar 2023 02:39:09 +0000 Subject: [PATCH 01/27] math utils in autogen --- flaml/autogen/__init__.py | 0 flaml/autogen/math_utils.py | 310 +++++++ flaml/integrations/oai/completion.py | 31 +- notebook/integrate_chatgpt.ipynb | 945 ++++++++++++-------- notebook/integrate_openai.ipynb | 4 +- website/docs/Examples/Integrate - OpenAI.md | 2 +- 6 files changed, 913 insertions(+), 379 deletions(-) create mode 100644 flaml/autogen/__init__.py create mode 100644 flaml/autogen/math_utils.py diff --git a/flaml/autogen/__init__.py b/flaml/autogen/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/flaml/autogen/math_utils.py b/flaml/autogen/math_utils.py new file mode 100644 index 0000000000..94388b4b26 --- /dev/null +++ b/flaml/autogen/math_utils.py @@ -0,0 +1,310 @@ +from typing import Optional + + +def remove_boxed(string: str) -> Optional[str]: + """Source: https://github.com/hendrycks/math + Extract the text within a \\boxed{...} environment. + Example: + >>> remove_boxed(\\boxed{\\frac{2}{3}}) + \\frac{2}{3} + """ + left = "\\boxed{" + try: + assert string[: len(left)] == left + assert string[-1] == "}" + return string[len(left) : -1] + except Exception: + return None + + +def last_boxed_only_string(string: str) -> Optional[str]: + """Source: https://github.com/hendrycks/math + Extract the last \\boxed{...} or \\fbox{...} element from a string. + """ + idx = string.rfind("\\boxed") + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def _fix_fracs(string: str) -> str: + """Source: https://github.com/hendrycks/math + Reformat fractions. + Examples: + >>> _fix_fracs("\\frac1b") + \frac{1}{b} + >>> _fix_fracs("\\frac12") + \frac{1}{2} + >>> _fix_fracs("\\frac1{72}") + \frac{1}{72} + """ + substrs = string.split("\\frac") + new_str = substrs[0] + if len(substrs) > 1: + substrs = substrs[1:] + for substr in substrs: + new_str += "\\frac" + if substr[0] == "{": + new_str += substr + else: + try: + assert len(substr) >= 2 + except Exception: + return string + a = substr[0] + b = substr[1] + if b != "{": + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}{" + b + "}" + post_substr + else: + new_str += "{" + a + "}{" + b + "}" + else: + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}" + b + post_substr + else: + new_str += "{" + a + "}" + b + string = new_str + return string + + +def _fix_a_slash_b(string: str) -> str: + """Source: https://github.com/hendrycks/math + Reformat fractions formatted as a/b to \\frac{a}{b}. + Example: + >>> _fix_a_slash_b("2/3") + \frac{2}{3} + """ + if len(string.split("/")) != 2: + return string + a_str = string.split("/")[0] + b_str = string.split("/")[1] + try: + a = int(a_str) + b = int(b_str) + assert string == "{}/{}".format(a, b) + new_string = "\\frac{" + str(a) + "}{" + str(b) + "}" + return new_string + except Exception: + return string + + +def _remove_right_units(string: str) -> str: + """Source: https://github.com/hendrycks/math + Remove units (on the right). 
+ "\\text{ " only ever occurs (at least in the val set) when describing units. + """ + if "\\text{ " in string: + splits = string.split("\\text{ ") + assert len(splits) == 2 + return splits[0] + else: + return string + + +def _fix_sqrt(string: str) -> str: + """Source: https://github.com/hendrycks/math + Reformat square roots. + Example: + >>> _fix_sqrt("\\sqrt3") + \\sqrt{3} + """ + if "\\sqrt" not in string: + return string + splits = string.split("\\sqrt") + new_string = splits[0] + for split in splits[1:]: + if split[0] != "{": + a = split[0] + new_substr = "\\sqrt{" + a + "}" + split[1:] + else: + new_substr = "\\sqrt" + split + new_string += new_substr + return new_string + + +def _strip_string(string: str) -> str: + """Source: https://github.com/hendrycks/math + Apply the reformatting helper functions above. + """ + # linebreaks + string = string.replace("\n", "") + # print(string) + + # remove inverse spaces + string = string.replace("\\!", "") + # print(string) + + # replace \\ with \ + string = string.replace("\\\\", "\\") + # print(string) + + # replace tfrac and dfrac with frac + string = string.replace("tfrac", "frac") + string = string.replace("dfrac", "frac") + # print(string) + + # remove \left and \right + string = string.replace("\\left", "") + string = string.replace("\\right", "") + # print(string) + + # Remove circ (degrees) + string = string.replace("^{\\circ}", "") + string = string.replace("^\\circ", "") + + # remove dollar signs + string = string.replace("\\$", "") + + # remove units (on the right) + string = _remove_right_units(string) + + # remove percentage + string = string.replace("\\%", "") + string = string.replace("%", "") + + # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string + string = string.replace(" .", " 0.") + string = string.replace("{.", "{0.") + # if empty, return empty string + if len(string) == 0: + return string + if string[0] == ".": + string = "0" + string + + # to consider: get rid of e.g. "k = " or "q = " at beginning + if len(string.split("=")) == 2: + if len(string.split("=")[0]) <= 2: + string = string.split("=")[1] + + # fix sqrt3 --> sqrt{3} + string = _fix_sqrt(string) + + # remove spaces + string = string.replace(" ", "") + + # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. + # Even works with \frac1{72} (but not \frac{72}1). + # Also does a/b --> \\frac{a}{b} + string = _fix_fracs(string) + + # manually change 0.5 --> \frac{1}{2} + if string == "0.5": + string = "\\frac{1}{2}" + + # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y + string = _fix_a_slash_b(string) + + return string + + +def get_answer(solution: Optional[str]) -> Optional[str]: + if solution is None: + return None + last_boxed = last_boxed_only_string(solution) + if last_boxed is None: + return None + answer = remove_boxed(last_boxed) + if answer is None: + return None + return answer + + +def is_equiv(str1: Optional[str], str2: Optional[str]) -> float: + """Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in + - units + - fractions + - square roots + - superfluous LaTeX. 
+ Source: https://github.com/hendrycks/math + """ + if str1 is None and str2 is None: + print("WARNING: Both None") + return 1.0 + if str1 is None or str2 is None: + return 0.0 + + try: + ss1 = _strip_string(str1) + ss2 = _strip_string(str2) + return float(ss1 == ss2) + except Exception: + return float(str1 == str2) + + +def is_equiv_chain_of_thought(str1: str, str2: str) -> float: + """Strips the solution first before calling `is_equiv`.""" + ans1 = get_answer(str1) + ans2 = get_answer(str2) + + return is_equiv(ans1, ans2) + + +def voting_counts(responses): + answers = {} + for i in range(len(responses)): + equiv = i + if get_answer(responses[i]) is None: + # ignore None answers + continue + for j in answers: + if is_equiv_chain_of_thought(responses[i], responses[j]): + equiv = j + break + if equiv in answers: + answers[equiv] += 1 + else: + answers[equiv] = 1 + return answers + + +def success_metrics(responses, solution, **args): + """Check if each response is correct. + + Args: + responses (list): The list of responses. + solution (str): The canonical solution. + + Returns: + dict: The success metrics. + """ + success_list = [] + n = len(responses) + for i in range(n): + response = responses[i] + succeed = is_equiv_chain_of_thought(response, solution) + success_list.append(succeed) + # voting + answers = voting_counts(responses) + # find the answer with highest votes in answers + answer = max(answers.items(), key=lambda x: x[1], default=(0, 0))[0] + # check if the answer is correct + success_vote = is_equiv_chain_of_thought(responses[answer], solution) + return { + "expected_success": 1 - pow(1 - sum(success_list) / n, n), + "success": any(s for s in success_list), + "success_vote": success_vote, + "voted_answer": responses[answer], + } diff --git a/flaml/integrations/oai/completion.py b/flaml/integrations/oai/completion.py index 0a7ce3d40c..41c2eff363 100644 --- a/flaml/integrations/oai/completion.py +++ b/flaml/integrations/oai/completion.py @@ -291,7 +291,8 @@ def eval(cls, config: dict, prune=True, eval_only=False): else: start_n = config_n params = config.copy() - params["stop"] = stop + if "stop" in config: + params["stop"] = stop temperature_or_top_p = params.pop("temperature_or_top_p", None) if temperature_or_top_p: params.update(temperature_or_top_p) @@ -490,20 +491,20 @@ def tune( The function should take a list of responses and a data point as input, and return a dict of metrics. For example, - ```python - def eval_func(responses, **data): - solution = data["solution"] - success_list = [] - n = len(responses) - for i in range(n): - response = responses[i] - succeed = is_equiv_chain_of_thought(response, solution) - success_list.append(succeed) - return { - "expected_success": 1 - pow(1 - sum(success_list) / n, n), - "success": any(s for s in success_list), - } - ``` + ```python + def eval_func(responses, **data): + solution = data["solution"] + success_list = [] + n = len(responses) + for i in range(n): + response = responses[i] + succeed = is_equiv_chain_of_thought(response, solution) + success_list.append(succeed) + return { + "expected_success": 1 - pow(1 - sum(success_list) / n, n), + "success": any(s for s in success_list), + } + ``` log_file_name (str, optional): The log file. inference_budget (float, optional): The inference budget. 
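For reference, the new `success_metrics` helper in `flaml/autogen/math_utils.py` (added above) is designed to be passed as the `eval_func` described in this docstring. Below is a minimal sketch of the intended wiring — not part of this patch — assuming `tune_data` is a list of dicts with "problem" and "solution" fields, as in the notebook that follows:

```python
from flaml import oai
from flaml.autogen.math_utils import success_metrics

# Sketch only: `tune_data` is an assumed list of dicts with "problem" and
# "solution" keys (e.g., loaded from the MATH dataset as in the notebook).
config, analysis = oai.ChatCompletion.tune(
    data=tune_data,             # the data for tuning
    metric="success_vote",      # one of the metrics reported by success_metrics
    mode="max",                 # maximize the metric
    eval_func=success_metrics,  # voting-based evaluation from math_utils
    inference_budget=0.02,      # inference budget per instance (dollars)
    optimization_budget=1,      # total optimization budget (dollars)
    num_samples=-1,             # -1: number of trials decided by the budget only
)
```

`success_metrics` returns `expected_success`, `success`, `success_vote`, and the `voted_answer`, so any of the first three can serve as the tuning metric.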
diff --git a/notebook/integrate_chatgpt.ipynb b/notebook/integrate_chatgpt.ipynb index 5495b37c78..3740cb783e 100644 --- a/notebook/integrate_chatgpt.ipynb +++ b/notebook/integrate_chatgpt.ipynb @@ -67,7 +67,9 @@ "import os\n", "\n", "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"" + " os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "import openai\n", + "openai.api_key_path = \"key.txt\"" ] }, { @@ -93,7 +95,7 @@ "source": [ "# openai.api_type = \"azure\"\n", "# openai.api_base = \"https://.openai.azure.com/\"\n", - "# openai.api_version = \"2022-12-01\"" + "# openai.api_version = \"2023-03-15-preview\"" ] }, { @@ -108,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:40:52.339977Z", @@ -129,7 +131,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8358c4bf9cc44b99916c9b6cb1e3a279", + "model_id": "764a0e25b95047bd99ad2c7b798cb472", "version_major": 2, "version_minor": 0 }, @@ -195,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:40:54.607152Z", @@ -231,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:40:54.613590Z", @@ -260,12 +262,12 @@ "source": [ "## Define Success Metric\n", "\n", - "Before we start tuning, we need to define the success metric we want to opotimize. For each math task, we use voting to select a response with the most common answers out of all the generated responses. If it has an equivalent answer to the canonical solution, we consider the task as successfully solved. Then we can optimize the mean success rate of a collection of tasks." + "Before we start tuning, we need to define the success metric we want to optimize. For each math task, we use voting to select a response with the most common answers out of all the generated responses. If it has an equivalent answer to the canonical solution, we consider the task as successfully solved. Then we can optimize the mean success rate of a collection of tasks." 
] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:40:54.626998Z", @@ -276,310 +278,7 @@ }, "outputs": [], "source": [ - "from typing import Optional\n", - "\n", - "def remove_boxed(string: str) -> Optional[str]:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Extract the text within a \\\\boxed{...} environment.\n", - " Example:\n", - " >>> remove_boxed(\\\\boxed{\\\\frac{2}{3}})\n", - " \\\\frac{2}{3}\n", - " \"\"\"\n", - " left = \"\\\\boxed{\"\n", - " try:\n", - " assert string[: len(left)] == left\n", - " assert string[-1] == \"}\"\n", - " return string[len(left) : -1]\n", - " except Exception:\n", - " return None\n", - "\n", - "\n", - "def last_boxed_only_string(string: str) -> Optional[str]:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Extract the last \\\\boxed{...} or \\\\fbox{...} element from a string.\n", - " \"\"\"\n", - " idx = string.rfind(\"\\\\boxed\")\n", - " if idx < 0:\n", - " idx = string.rfind(\"\\\\fbox\")\n", - " if idx < 0:\n", - " return None\n", - "\n", - " i = idx\n", - " right_brace_idx = None\n", - " num_left_braces_open = 0\n", - " while i < len(string):\n", - " if string[i] == \"{\":\n", - " num_left_braces_open += 1\n", - " if string[i] == \"}\":\n", - " num_left_braces_open -= 1\n", - " if num_left_braces_open == 0:\n", - " right_brace_idx = i\n", - " break\n", - " i += 1\n", - "\n", - " if right_brace_idx is None:\n", - " retval = None\n", - " else:\n", - " retval = string[idx : right_brace_idx + 1]\n", - "\n", - " return retval\n", - "\n", - "\n", - "def _fix_fracs(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat fractions.\n", - " Examples:\n", - " >>> _fix_fracs(\"\\\\frac1b\")\n", - " \\frac{1}{b}\n", - " >>> _fix_fracs(\"\\\\frac12\")\n", - " \\frac{1}{2}\n", - " >>> _fix_fracs(\"\\\\frac1{72}\")\n", - " \\frac{1}{72}\n", - " \"\"\"\n", - " substrs = string.split(\"\\\\frac\")\n", - " new_str = substrs[0]\n", - " if len(substrs) > 1:\n", - " substrs = substrs[1:]\n", - " for substr in substrs:\n", - " new_str += \"\\\\frac\"\n", - " if substr[0] == \"{\":\n", - " new_str += substr\n", - " else:\n", - " try:\n", - " assert len(substr) >= 2\n", - " except Exception:\n", - " return string\n", - " a = substr[0]\n", - " b = substr[1]\n", - " if b != \"{\":\n", - " if len(substr) > 2:\n", - " post_substr = substr[2:]\n", - " new_str += \"{\" + a + \"}{\" + b + \"}\" + post_substr\n", - " else:\n", - " new_str += \"{\" + a + \"}{\" + b + \"}\"\n", - " else:\n", - " if len(substr) > 2:\n", - " post_substr = substr[2:]\n", - " new_str += \"{\" + a + \"}\" + b + post_substr\n", - " else:\n", - " new_str += \"{\" + a + \"}\" + b\n", - " string = new_str\n", - " return string\n", - "\n", - "\n", - "def _fix_a_slash_b(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat fractions formatted as a/b to \\\\frac{a}{b}.\n", - " Example:\n", - " >>> _fix_a_slash_b(\"2/3\")\n", - " \\frac{2}{3}\n", - " \"\"\"\n", - " if len(string.split(\"/\")) != 2:\n", - " return string\n", - " a_str = string.split(\"/\")[0]\n", - " b_str = string.split(\"/\")[1]\n", - " try:\n", - " a = int(a_str)\n", - " b = int(b_str)\n", - " assert string == \"{}/{}\".format(a, b)\n", - " new_string = \"\\\\frac{\" + str(a) + \"}{\" + str(b) + \"}\"\n", - " return new_string\n", - " except Exception:\n", - " return string\n", - "\n", - "\n", - "def _remove_right_units(string: str) -> 
str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Remove units (on the right).\n", - " \"\\\\text{ \" only ever occurs (at least in the val set) when describing units.\n", - " \"\"\"\n", - " if \"\\\\text{ \" in string:\n", - " splits = string.split(\"\\\\text{ \")\n", - " assert len(splits) == 2\n", - " return splits[0]\n", - " else:\n", - " return string\n", - "\n", - "\n", - "def _fix_sqrt(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat square roots.\n", - " Example:\n", - " >>> _fix_sqrt(\"\\\\sqrt3\")\n", - " \\sqrt{3}\n", - " \"\"\"\n", - " if \"\\\\sqrt\" not in string:\n", - " return string\n", - " splits = string.split(\"\\\\sqrt\")\n", - " new_string = splits[0]\n", - " for split in splits[1:]:\n", - " if split[0] != \"{\":\n", - " a = split[0]\n", - " new_substr = \"\\\\sqrt{\" + a + \"}\" + split[1:]\n", - " else:\n", - " new_substr = \"\\\\sqrt\" + split\n", - " new_string += new_substr\n", - " return new_string\n", - "\n", - "\n", - "def _strip_string(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Apply the reformatting helper functions above.\n", - " \"\"\"\n", - " # linebreaks\n", - " string = string.replace(\"\\n\", \"\")\n", - " # print(string)\n", - "\n", - " # remove inverse spaces\n", - " string = string.replace(\"\\\\!\", \"\")\n", - " # print(string)\n", - "\n", - " # replace \\\\ with \\\n", - " string = string.replace(\"\\\\\\\\\", \"\\\\\")\n", - " # print(string)\n", - "\n", - " # replace tfrac and dfrac with frac\n", - " string = string.replace(\"tfrac\", \"frac\")\n", - " string = string.replace(\"dfrac\", \"frac\")\n", - " # print(string)\n", - "\n", - " # remove \\left and \\right\n", - " string = string.replace(\"\\\\left\", \"\")\n", - " string = string.replace(\"\\\\right\", \"\")\n", - " # print(string)\n", - "\n", - " # Remove circ (degrees)\n", - " string = string.replace(\"^{\\\\circ}\", \"\")\n", - " string = string.replace(\"^\\\\circ\", \"\")\n", - "\n", - " # remove dollar signs\n", - " string = string.replace(\"\\\\$\", \"\")\n", - "\n", - " # remove units (on the right)\n", - " string = _remove_right_units(string)\n", - "\n", - " # remove percentage\n", - " string = string.replace(\"\\\\%\", \"\")\n", - " string = string.replace(\"\\%\", \"\")\n", - "\n", - " # \" 0.\" equivalent to \" .\" and \"{0.\" equivalent to \"{.\" Alternatively, add \"0\" if \".\" is the start of the string\n", - " string = string.replace(\" .\", \" 0.\")\n", - " string = string.replace(\"{.\", \"{0.\")\n", - " # if empty, return empty string\n", - " if len(string) == 0:\n", - " return string\n", - " if string[0] == \".\":\n", - " string = \"0\" + string\n", - "\n", - " # to consider: get rid of e.g. 
\"k = \" or \"q = \" at beginning\n", - " if len(string.split(\"=\")) == 2:\n", - " if len(string.split(\"=\")[0]) <= 2:\n", - " string = string.split(\"=\")[1]\n", - "\n", - " # fix sqrt3 --> sqrt{3}\n", - " string = _fix_sqrt(string)\n", - "\n", - " # remove spaces\n", - " string = string.replace(\" \", \"\")\n", - "\n", - " # \\frac1b or \\frac12 --> \\frac{1}{b} and \\frac{1}{2}, etc.\n", - " # Even works with \\frac1{72} (but not \\frac{72}1).\n", - " # Also does a/b --> \\\\frac{a}{b}\n", - " string = _fix_fracs(string)\n", - "\n", - " # manually change 0.5 --> \\frac{1}{2}\n", - " if string == \"0.5\":\n", - " string = \"\\\\frac{1}{2}\"\n", - "\n", - " # NOTE: X/Y changed to \\frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y\n", - " string = _fix_a_slash_b(string)\n", - "\n", - " return string\n", - "\n", - "\n", - "def get_answer(solution: Optional[str]) -> Optional[str]:\n", - " if solution is None:\n", - " return None\n", - " last_boxed = last_boxed_only_string(solution)\n", - " if last_boxed is None:\n", - " return None\n", - " answer = remove_boxed(last_boxed)\n", - " if answer is None:\n", - " return None\n", - " return answer\n", - "\n", - "\n", - "def is_equiv(str1: Optional[str], str2: Optional[str]) -> float:\n", - " \"\"\"Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in\n", - " - units\n", - " - fractions\n", - " - square roots\n", - " - superfluous LaTeX.\n", - " Source: https://github.com/hendrycks/math\n", - " \"\"\"\n", - " if str1 is None and str2 is None:\n", - " print(\"WARNING: Both None\")\n", - " return 1.0\n", - " if str1 is None or str2 is None:\n", - " return 0.0\n", - "\n", - " try:\n", - " ss1 = _strip_string(str1)\n", - " ss2 = _strip_string(str2)\n", - " return float(ss1 == ss2)\n", - " except Exception:\n", - " return float(str1 == str2)\n", - "\n", - "\n", - "def is_equiv_chain_of_thought(str1: str, str2: str) -> float:\n", - " \"\"\"Strips the solution first before calling `is_equiv`.\"\"\"\n", - " ans1 = get_answer(str1)\n", - " ans2 = get_answer(str2)\n", - "\n", - " return is_equiv(ans1, ans2)\n", - "\n", - "\n", - "def success_metrics(responses, solution, **args):\n", - " \"\"\"Check if each response is correct.\n", - " \n", - " Args:\n", - " responses (list): The list of responses.\n", - " solution (str): The canonical solution.\n", - " \n", - " Returns:\n", - " dict: The success metrics.\n", - " \"\"\"\n", - " success_list = []\n", - " n = len(responses)\n", - " for i in range(n):\n", - " response = responses[i]\n", - " succeed = is_equiv_chain_of_thought(response, solution)\n", - " success_list.append(succeed)\n", - " # voting\n", - " answers = {}\n", - " for i in range(n):\n", - " equiv = i\n", - " if get_answer(responses[i]) is None:\n", - " # ignore None answers\n", - " continue\n", - " for j in answers:\n", - " if is_equiv_chain_of_thought(responses[i], responses[j]):\n", - " equiv = j\n", - " break\n", - " if equiv in answers:\n", - " answers[equiv] += 1\n", - " else:\n", - " answers[equiv] = 1\n", - " # find the answer with highest votes in answers\n", - " answer = max(answers.items(), key=lambda x: x[1], default=(0, 0))[0]\n", - " # check if the answer is correct\n", - " success_vote = is_equiv_chain_of_thought(responses[answer], solution)\n", - " return {\n", - " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", - " \"success\": any(s for s in success_list),\n", - " \"success_vote\": success_vote,\n", - " \"voted_answer\": 
responses[answer],\n", - " }\n" + "from flaml.autogen.math_utils import success_metrics" ] }, { @@ -600,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:40:54.634335Z", @@ -627,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:40:56.109177Z", @@ -685,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:40:56.115383Z", @@ -699,31 +398,37 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2023-03-26 04:03:37,074]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", - "\u001b[32m[I 2023-03-26 04:03:37,077]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001b[32m[I 2023-03-29 22:15:13,167]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001b[32m[I 2023-03-29 22:15:13,169]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 1 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.9, 'success': 0.9, 'success_vote': 0.9, 'voted_answer': 'We use the distance formula to find the distance between the two points: $\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{3^2+(-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=\\\\boxed{5}$.', 'total_cost': 0.13772999999999996, 'cost': 0.13772999999999996, 'inference_cost': 0.0068864999999999985, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.004978179931640625}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.8, 'success': 0.8, 'success_vote': 0.8, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$$ Letting $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0)$, we have: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5$$ Therefore, the distance between the points $(0,4)$ and $(3,0)$ is $\\\\boxed{5}$.', 'total_cost': 0.145722, 'cost': 0.007992, 'inference_cost': 0.00039759999999999996, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0047664642333984375}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 3 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 
0.5140933870421127, 'success': 0.55, 'success_vote': 0.5, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5.$$ Therefore, the distance between the points (0,4) and (3,0) is $\\\\boxed{5}$.', 'total_cost': 0.21644799999999997, 'cost': 0.07072600000000001, 'inference_cost': 0.0035343, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.4985070123025904}, 'config/max_tokens': 97, 'config/n': 20, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.010622501373291016}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.3386014997741698, 'success': 0.4, 'success_vote': 0.35, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=\\\\boxed{5}.$$', 'total_cost': 0.3192479999999999, 'cost': 0.10279999999999999, 'inference_cost': 0.005138, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9533933461949365}, 'config/max_tokens': 50, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.015543699264526367}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.9999998720099207, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2},$$ where $(x_1,y_1)$ and $(x_2,y_2)$ are the given points. 
Plugging in the values $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0),$ we have: $$\\\\sqrt{(3-0)^2+(0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25}.$$ Therefore, the distance between the two points is $\\\\boxed{5}$.', 'total_cost': 0.6322379999999999, 'cost': 0.31299, 'inference_cost': 0.015323400000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.05237627029418945}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 6 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'success_vote': 0, 'total_cost': 0.7246679999999999, 'cost': 0.09243, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.4340139933332937}, 'config/max_tokens': 317, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.001924753189086914}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.7572581384563789, 'success': 0.8, 'success_vote': 0.8, 'voted_answer': 'We use the distance formula: \\\\begin{align*}\\n\\\\text{distance}&=\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}\\\\\\\\\\n&=\\\\sqrt{(3-0)^2+(0-4)^2}\\\\\\\\\\n&=\\\\sqrt{9+16}\\\\\\\\\\n&=\\\\sqrt{25}\\\\\\\\\\n&=\\\\boxed{5}.\\n\\\\end{align*}', 'total_cost': 0.7647499999999999, 'cost': 0.04008199999999999, 'inference_cost': 0.0020021, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9086488808086682}, 'config/max_tokens': 129, 'config/n': 9, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.007839441299438477}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'success_vote': 0, 'total_cost': 1.0214359999999998, 'cost': 0.25668599999999997, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6262871483113925}, 'config/max_tokens': 257, 'config/n': 82, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.009511232376098633}\n", - "[flaml.tune.tune: 03-26 04:03:37] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + "[flaml.tune.tune: 03-29 22:15:13] {832} INFO - trial 1 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:19:26] {215} INFO - 
result: {'expected_success': 0.8, 'success': 0.8, 'success_vote': 0.8, 'voted_answer': 'We use the distance formula to find the distance between the points $(0,4)$ and $(3,0)$: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{3^2+(-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=\\\\boxed{5}.$$', 'total_cost': 0.14595, 'cost': 0.14595, 'inference_cost': 0.007297499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 252.8926329612732}\n", + "[flaml.tune.tune: 03-29 22:19:26] {832} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:20:40] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'success_vote': 0.75, 'voted_answer': 'Using the distance formula, we have:\\n\\n$$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5$$\\n\\nTherefore, the distance between the points (0,4) and (3,0) is $\\\\boxed{5}$.', 'total_cost': 0.153144, 'cost': 0.007194, 'inference_cost': 0.0003577, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 73.90449404716492}\n", + "[flaml.tune.tune: 03-29 22:20:40] {832} INFO - trial 3 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:21:38] {215} INFO - result: {'expected_success': 0.5234643092959356, 'success': 0.55, 'success_vote': 0.45, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$$ Letting $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0)$, we have: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5$$ Therefore, the distance between the points', 'total_cost': 0.22482, 'cost': 0.071676, 'inference_cost': 0.0035818000000000004, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.4985070123025904}, 'config/max_tokens': 97, 'config/n': 20, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 57.6577262878418}\n", + "[flaml.tune.tune: 03-29 22:21:38] {832} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:22:17] {215} INFO - result: {'expected_success': 0.3703632292361099, 'success': 0.45, 'success_vote': 0.45, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(3-0)^2+(0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25} = \\\\boxed{5}.$$', 'total_cost': 0.3276559999999999, 'cost': 0.10283600000000002, 'inference_cost': 0.0051398, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9533933461949365}, 
'config/max_tokens': 50, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 39.326674938201904}\n", + "[flaml.tune.tune: 03-29 22:22:17] {832} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:25:44] {215} INFO - result: {'expected_success': 0.9999999951313461, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'Using the distance formula, we have \\\\begin{align*}\\n\\\\text{distance} &= \\\\sqrt{(3-0)^2 + (0-4)^2} \\\\\\\\\\n&= \\\\sqrt{9+16} \\\\\\\\\\n&= \\\\sqrt{25} \\\\\\\\\\n&= \\\\boxed{5}.\\n\\\\end{align*}', 'total_cost': 0.6470319999999998, 'cost': 0.319376, 'inference_cost': 0.0153156, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 207.07495784759521}\n", + "[flaml.tune.tune: 03-29 22:25:44] {832} INFO - trial 6 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:26:12] {215} INFO - result: {'success_vote': 0, 'total_cost': 0.7075719999999999, 'cost': 0.060540000000000004, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.4340139933332937}, 'config/max_tokens': 317, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 27.423347234725952}\n", + "[flaml.tune.tune: 03-29 22:26:12] {832} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:27:27] {215} INFO - result: {'expected_success': 0.8195010292808753, 'success': 0.9, 'success_vote': 0.85, 'voted_answer': 'Using the distance formula, we have:\\n\\n$\\\\sqrt{(3-0)^2 + (0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25} = \\\\boxed{5}$.', 'total_cost': 0.7473699999999996, 'cost': 0.039798, 'inference_cost': 0.0019879, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9086488808086682}, 'config/max_tokens': 129, 'config/n': 9, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 75.42242240905762}\n", + "[flaml.tune.tune: 03-29 22:27:27] {832} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:28:32] {215} INFO - result: {'success_vote': 0, 'total_cost': 0.9240679999999994, 'cost': 0.176698, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6262871483113925}, 'config/max_tokens': 257, 'config/n': 82, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 65.16396474838257}\n", + 
"[flaml.tune.tune: 03-29 22:28:32] {832} INFO - trial 9 config: {'model': 'gpt-4', 'temperature_or_top_p': {'top_p': 0.2959617068796787}, 'max_tokens': 705, 'n': 33, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:28:32] {215} INFO - result: {'inference_cost': inf, 'success_vote': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'top_p': 0.2959617068796787}, 'max_tokens': 705, 'n': 33, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'top_p': 0.2959617068796787}, 'config/max_tokens': 705, 'config/n': 33, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005254745483398438}\n", + "[flaml.tune.tune: 03-29 22:28:32] {832} INFO - trial 10 config: {'model': 'gpt-4', 'temperature_or_top_p': {'top_p': 0.15115201964256386}, 'max_tokens': 157, 'n': 94, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:28:42] {215} INFO - result: {'success_vote': 0, 'total_cost': 0.9455479999999994, 'cost': 0.021479999999999996, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'top_p': 0.15115201964256386}, 'max_tokens': 157, 'n': 94, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'top_p': 0.15115201964256386}, 'config/max_tokens': 157, 'config/n': 94, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 10.173765182495117}\n", + "[flaml.tune.tune: 03-29 22:28:42] {832} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5973716482308843}, 'max_tokens': 746, 'n': 53, 'prompt': 0}\n", + "[flaml.tune.tune: 03-29 22:29:38] {215} INFO - result: {'success_vote': 0, 'total_cost': 1.0220939999999994, 'cost': 0.076546, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5973716482308843}, 'max_tokens': 746, 'n': 53, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5973716482308843}, 'config/max_tokens': 746, 'config/n': 53, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 56.077386140823364}\n", + "[flaml.tune.tune: 03-29 22:29:38] {855} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" ] } ], @@ -737,7 +442,7 @@ " mode=\"max\", # the optimization mode\n", " eval_func=success_metrics, # the evaluation function to return the success metrics\n", " # log_file_name=\"logs/math.log\", # the log file name\n", - " inference_budget=0.03, # the inference budget (dollar)\n", + " inference_budget=0.02, # the inference budget (dollar)\n", " optimization_budget=1, # the optimization budget (dollar)\n", " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", " # -1 means decided by the optimization budget only\n", @@ -762,7 +467,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:41:55.049204Z", @@ -777,7 +482,7 @@ "output_type": "stream", "text": [ "optimized config {'model': 'gpt-3.5-turbo', 'max_tokens': 424, 'n': 54, 'prompt': '{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.', 'stop': None, 'temperature': 0.9177741225129434}\n", - "best result on tuning data {'expected_success': 0.9999998720099207, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2},$$ where $(x_1,y_1)$ and $(x_2,y_2)$ are the given points. 
Plugging in the values $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0),$ we have: $$\\\\sqrt{(3-0)^2+(0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25}.$$ Therefore, the distance between the two points is $\\\\boxed{5}$.', 'total_cost': 0.6322379999999999, 'cost': 0.31299, 'inference_cost': 0.015323400000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.05237627029418945}\n" + "best result on tuning data {'expected_success': 0.9999999951313461, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'Using the distance formula, we have \\\\begin{align*}\\n\\\\text{distance} &= \\\\sqrt{(3-0)^2 + (0-4)^2} \\\\\\\\\\n&= \\\\sqrt{9+16} \\\\\\\\\\n&= \\\\sqrt{25} \\\\\\\\\\n&= \\\\boxed{5}.\\n\\\\end{align*}', 'total_cost': 0.6470319999999998, 'cost': 0.319376, 'inference_cost': 0.0153156, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 207.07495784759521}\n" ] } ], @@ -802,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:41:55.056205Z", @@ -1272,7 +977,7 @@ "source": [ "responses = oai.ChatCompletion.create(context=tune_data[1], **config)\n", "print(responses)\n", - "print(success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n" + "print(success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))" ] }, { @@ -1287,7 +992,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2023-02-13T23:41:56.042764Z", @@ -1301,14 +1006,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'expected_success': 0.9878128576084944, 'success': 0.9950248756218906, 'success_vote': 0.9203980099502488, 'voted_answer': \"We have that $1$ kilowatt is equivalent to $1.36$ horsepower. Therefore, we can set up the proportion $\\\\frac{1\\\\text{ kW}}{1.36\\\\text{ hp}} = \\\\frac{x\\\\text{ kW}}{500\\\\text{ hp}}$, where $x$ is the number of kilowatts that Eric's car's engine can generate. Solving for $x$, we get $x = \\\\frac{(1\\\\text{ kW})(500\\\\text{ hp})}{1.36\\\\text{ hp}} \\\\approx \\\\boxed{368 \\\\text{ kW}}$.\", 'total_cost': 4.194939999999996, 'cost': 3.1735039999999994, 'inference_cost': 0.01577204825870647}\n" + "{'expected_success': 0.9889659482835421, 'success': 0.9950248756218906, 'success_vote': 0.9154228855721394, 'voted_answer': \"We can start by setting up a proportion: $$\\\\frac{\\\\text{power in kilowatts}}{\\\\text{power in metric horsepower}}=\\\\frac{1}{1.36}$$ Let $P$ be the power in kilowatts. 
Then, we can write: $$\\\\frac{P}{500}= \\\\frac{1}{1.36}$$ Cross-multiplying, we have: $$P=\\\\frac{500}{1.36} \\\\approx \\\\boxed{368}$$ Therefore, Eric's car's engine can generate approximately $\\\\boxed{368}$ kilowatts of power.\", 'total_cost': 4.208937999999998, 'cost': 3.186843999999999, 'inference_cost': 0.015838416417910447}\n" ] } ], "source": [ - "# oai.ChatCompletion.data = test_data\n", - "# result = oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True)\n", - "# print(result)" + "oai.ChatCompletion.data = test_data\n", + "result = oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True)\n", + "print(result)" ] }, { @@ -1321,14 +1026,184 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 14, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "retrying in 10 seconds...\n", + "Traceback (most recent call last):\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 669, in _interpret_response_line\n", + " data = json.loads(rbody)\n", + " File \"/usr/local/lib/python3.9/json/__init__.py\", line 346, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/local/lib/python3.9/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/local/lib/python3.9/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/workspaces/FLAML/flaml/integrations/oai/completion.py\", line 139, in _get_response\n", + " response = openai_completion.create(**config)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py\", line 25, in create\n", + " return super().create(*args, **kwargs)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py\", line 153, in create\n", + " response, _, api_key = requestor.request(\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 226, in request\n", + " resp, got_stream = self._interpret_response(result, stream)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 619, in _interpret_response\n", + " self._interpret_response_line(\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 671, in _interpret_response_line\n", + " raise error.APIError(\n", + "openai.error.APIError: HTTP code 524 from API (\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "api.openai.com | 524: A timeout occurred\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

\n", + " \n", + " A timeout occurred\n", + " Error code 524\n", + "

\n", + "
\n", + " Visit cloudflare.com for more information.\n", + "
\n", + "
2023-03-29 23:14:37 UTC
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " You\n", + "

\n", + " \n", + " Browser\n", + " \n", + "

\n", + " Working\n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Chicago\n", + "

\n", + " \n", + " Cloudflare\n", + " \n", + "

\n", + " Working\n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " api.openai.com\n", + "

\n", + " \n", + " Host\n", + " \n", + "

\n", + " Error\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "

What happened?

\n", + "

The origin web server timed out responding to this request.

\n", + "
\n", + " \n", + "
\n", + "

What can I do?

\n", + "

If you're a visitor of this website:

\n", + "

Please try again in a few minutes.

\n", + "\n", + "

If you're the owner of this website:

\n", + "

The connection to the origin web server was made, but the origin web server timed out before responding. The likely cause is an overloaded background task, database or application, stressing the resources on your web server. To resolve, please work with your hosting provider or web development team to free up resources for your database or overloaded application. Additional troubleshooting information here.

\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + ")\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "{'expected_success': 0.6965174129353234, 'success': 0.6965174129353234, 'success_vote': 0.6965174129353234, 'voted_answer': \"If we let $x$ be the number of kilowatts, then we can set up the proportion $\\\\frac{x\\\\text{ kW}}{500\\\\text{ hp}}=\\\\frac{1\\\\text{ kW}}{1.36\\\\text{ hp}}$. Solving for $x$, we get $x=\\\\frac{500}{1.36} = 367.65$. Rounding to the nearest integer, we get that Eric's car's engine has $\\\\boxed{368}$ kilowatts.\", 'total_cost': 6.009489999999993, 'cost': 1.8145500000000006, 'inference_cost': 0.008809679104477611}\n" + "{'expected_success': 0.7213930348258707, 'success': 0.7213930348258707, 'success_vote': 0.7213930348258707, 'voted_answer': \"First, convert the $500$ horsepower to kilowatts by dividing by $1.36$. $500/1.46 \\\\approx 360.5$. Therefore Eric's car can generate about $\\\\boxed{361}$ kilowatts of power.\", 'total_cost': 6.099327999999994, 'cost': 1.8903900000000013, 'inference_cost': 0.009186992537313433}\n", + "default config of GPT-4 succeeds in 72.1% test cases\n", + "tuned config succeeds in 91.5% test cases\n" ] } ], @@ -1336,28 +1211,208 @@ "# assuming you have access to gpt-4; otherwise use gpt-3.5-turbo\n", "# the following code will cost roughly $2 if uncommented and run.\n", "\n", - "# default_config = {\"model\": 'gpt-4', \"prompt\": 0}\n", - "# default_result = oai.ChatCompletion.eval(default_config, prune=False, eval_only=True)\n", - "# print(default_result)" + "default_config = {\"model\": 'gpt-4', \"prompt\": 0}\n", + "default_result = oai.ChatCompletion.eval(default_config, prune=False, eval_only=True)\n", + "print(default_result)\n", + "print(\"default config of GPT-4 succeeds in {:.1f}% test cases\".format(default_result[\"success_vote\"] * 100))\n", + "print(\"tuned config succeeds in {:.1f}% test cases\".format(result[\"success_vote\"] * 100))\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default use of GPT-4 has a much lower accuracy. Note that the default config has a lower inference cost. What if we heuristically increase the number of responses n?" 
] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 15, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "retrying in 10 seconds...\n", + "Traceback (most recent call last):\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 669, in _interpret_response_line\n", + " data = json.loads(rbody)\n", + " File \"/usr/local/lib/python3.9/json/__init__.py\", line 346, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/local/lib/python3.9/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/local/lib/python3.9/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/workspaces/FLAML/flaml/integrations/oai/completion.py\", line 139, in _get_response\n", + " response = openai_completion.create(**config)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py\", line 25, in create\n", + " return super().create(*args, **kwargs)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py\", line 153, in create\n", + " response, _, api_key = requestor.request(\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 226, in request\n", + " resp, got_stream = self._interpret_response(result, stream)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 619, in _interpret_response\n", + " self._interpret_response_line(\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 671, in _interpret_response_line\n", + " raise error.APIError(\n", + "openai.error.APIError: HTTP code 524 from API (\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "api.openai.com | 524: A timeout occurred\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

\n", + " \n", + " A timeout occurred\n", + " Error code 524\n", + "

\n", + "
\n", + " Visit cloudflare.com for more information.\n", + "
\n", + "
2023-03-30 00:12:43 UTC
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " You\n", + "

\n", + " \n", + " Browser\n", + " \n", + "

\n", + " Working\n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Chicago\n", + "

\n", + " \n", + " Cloudflare\n", + " \n", + "

\n", + " Working\n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " api.openai.com\n", + "

\n", + " \n", + " Host\n", + " \n", + "

\n", + " Error\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "

What happened?

\n", + "

The origin web server timed out responding to this request.

\n", + "
\n", + " \n", + "
\n", + "

What can I do?

\n", + "

If you're a visitor of this website:

\n", + "

Please try again in a few minutes.

\n", + "\n", + "

If you're the owner of this website:

\n", + "

The connection to the origin web server was made, but the origin web server timed out before responding. The likely cause is an overloaded background task, database or application, stressing the resources on your web server. To resolve, please work with your hosting provider or web development team to free up resources for your database or overloaded application. Additional troubleshooting information here.

\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + ")\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "tuned config succeeds in 92.0% test cases\n", - "untuned config succeeds in 69.7% test cases\n" + "{'expected_success': 0.7699004975124378, 'success': 0.835820895522388, 'success_vote': 0.7064676616915423, 'voted_answer': \"Since $1$ kilowatt is equivalent to $1.36$ horsepower, and Eric's sports car's engine has a power of $500$ metric horsepower, his car's engine can generate $\\\\frac{500}{1.36} \\\\approx \\\\boxed{368}$ kilowatts of power.\", 'total_cost': 9.367137999999994, 'cost': 3.267810000000001, 'inference_cost': 0.016039828358208955}\n" ] } ], "source": [ - "# print(\"tuned config succeeds in {:.1f}% test cases\".format(result[\"success_vote\"] * 100))\n", - "# print(\"untuned config succeeds in {:.1f}% test cases\".format(default_result[\"success_vote\"] * 100))" + "# The following evaluation costs $3 and longer than one hour if you uncomment it and run it.\n", + "\n", + "config_n2 = {\"model\": 'gpt-4', \"prompt\": 0, \"n\": 2}\n", + "n2_result = oai.ChatCompletion.eval(config_n2, prune=False, eval_only=True)\n", + "print(n2_result)" ] }, { @@ -1365,28 +1420,196 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that the untuned config has a lower inference cost. What if we heuristically increase the number of responses n to 5?" + "The inference cost is doubled and matches the tuned config. But the success rate doesn't improve much. What if we further increase the number of responses n to 5?" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 16, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "retrying in 10 seconds...\n", + "Traceback (most recent call last):\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 669, in _interpret_response_line\n", + " data = json.loads(rbody)\n", + " File \"/usr/local/lib/python3.9/json/__init__.py\", line 346, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/local/lib/python3.9/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/local/lib/python3.9/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/workspaces/FLAML/flaml/integrations/oai/completion.py\", line 139, in _get_response\n", + " response = openai_completion.create(**config)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_resources/chat_completion.py\", line 25, in create\n", + " return super().create(*args, **kwargs)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py\", line 153, in create\n", + " response, _, api_key = requestor.request(\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 226, in request\n", + " resp, got_stream = self._interpret_response(result, stream)\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 619, in _interpret_response\n", + " self._interpret_response_line(\n", + " File \"/home/vscode/.local/lib/python3.9/site-packages/openai/api_requestor.py\", line 671, in 
_interpret_response_line\n",
+      "    raise error.APIError(\n",
+      "openai.error.APIError: HTTP code 524 from API (\n",
+      "api.openai.com | 524: A timeout occurred\n",
+      "Error code 524: the origin web server timed out responding to this request.\n",
+      "
\n", + "\n", + "\n", + "\n", + ")\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "{'expected_success': 0.9181755223880596, 'success': 0.9552238805970149, 'success_vote': 0.8756218905472637, 'voted_answer': \"To figure out how many kilowatts of power Eric's car can generate, we need to find the conversion factor for metric horsepower to kilowatts. To do this, we start by dividing the power in Eric's car in horsepower by the number of kilowatts per horsepower: $$\\\\frac{500\\\\text{ hp}}{1.36\\\\text{ hp/kW}}$$Now, to get to kilowatts, we divide by 1 hp, which gives us $$\\\\frac{500}{1.36}\\\\approx \\\\boxed{368}\\\\text{ kW}$$\", 'total_cost': 14.071600000000004, 'cost': 8.06211, 'inference_cost': 0.039892067164179104}\n" + "{'expected_success': 0.9197771144278613, 'success': 0.9552238805970149, 'success_vote': 0.8656716417910447, 'voted_answer': \"Since a kilowatt is equivalent to $1.36$ horsepower, then Eric's $500$ horsepower engine can generate $\\\\frac{500}{1.36}$ kilowatts, or about $\\\\boxed{368}$ kilowatts.\", 'total_cost': 16.885947999999996, 'cost': 7.518809999999999, 'inference_cost': 0.03718908208955224}\n" ] } ], "source": [ - "# The following evaluation costs $8 and nearly one hour if you uncomment it and run it.\n", + "# The following evaluation costs $8 and longer than one hour if you uncomment it and run it.\n", "\n", - "# config_larger = {\"model\": 'gpt-4', \"prompt\": 0, \"n\": 5}\n", - "# default_result = oai.ChatCompletion.eval(config_larger, prune=False, eval_only=True)\n", - "# print(default_result)" + "config_n5 = {\"model\": 'gpt-4', \"prompt\": 0, \"n\": 5}\n", + "n5_result = oai.ChatCompletion.eval(config_n5, prune=False, eval_only=True)\n", + "print(n5_result)" ] }, { diff --git a/notebook/integrate_openai.ipynb b/notebook/integrate_openai.ipynb index 6b7434265c..595d50be1c 100644 --- a/notebook/integrate_openai.ipynb +++ b/notebook/integrate_openai.ipynb @@ -93,7 +93,7 @@ "source": [ "# openai.api_type = \"azure\"\n", "# openai.api_base = \"https://.openai.azure.com/\"\n", - "# openai.api_version = \"2022-12-01\" # change if necessary" + "# openai.api_version = \"2023-03-15-preview\" # change if necessary" ] }, { @@ -838,7 +838,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.16 (main, Feb 11 2023, 02:49:26) \n[GCC 10.2.1 20210110]" }, "vscode": { "interpreter": { diff --git a/website/docs/Examples/Integrate - OpenAI.md b/website/docs/Examples/Integrate - OpenAI.md index f6289fd3ce..fcf73ff2b5 100644 --- a/website/docs/Examples/Integrate - OpenAI.md +++ b/website/docs/Examples/Integrate - OpenAI.md @@ -21,7 +21,7 @@ If you use Azure OpenAI, set up Azure using the following code: ```python openai.api_type = "azure" openai.api_base = "https://.openai.azure.com/" -openai.api_version = "2022-12-01" # change if necessary +openai.api_version = "2023-03-15-preview" # change if necessary ``` ### Load the dataset From 6c92d567fa689e29cd9ce6496275d08ca7d136a3 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 30 Mar 2023 05:59:46 +0000 Subject: [PATCH 02/27] cleanup --- notebook/integrate_chatgpt.ipynb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/notebook/integrate_chatgpt.ipynb b/notebook/integrate_chatgpt.ipynb index 3740cb783e..dd1206d4ad 100644 --- a/notebook/integrate_chatgpt.ipynb +++ b/notebook/integrate_chatgpt.ipynb @@ -67,9 +67,7 @@ "import os\n", "\n", "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = 
\"\"\n", - "import openai\n", - "openai.api_key_path = \"key.txt\"" + " os.environ[\"OPENAI_API_KEY\"] = \"\"" ] }, { From 54bf87b2e714f177daa36f74ca5948dc5d8eb4d8 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Sun, 2 Apr 2023 18:21:19 +0000 Subject: [PATCH 03/27] code utils --- flaml/__init__.py | 2 +- flaml/autogen/code_utils.py | 148 +++++++++++++++++++++++++++ flaml/autogen/math_utils.py | 3 +- flaml/autogen/oai/__init__.py | 3 + flaml/integrations/__init__.py | 0 flaml/integrations/oai/__init__.py | 3 - flaml/integrations/oai/completion.py | 93 +++++++++++++++-- 7 files changed, 237 insertions(+), 15 deletions(-) create mode 100644 flaml/autogen/code_utils.py create mode 100644 flaml/autogen/oai/__init__.py delete mode 100644 flaml/integrations/__init__.py delete mode 100644 flaml/integrations/oai/__init__.py diff --git a/flaml/__init__.py b/flaml/__init__.py index a0a6138880..9fca486499 100644 --- a/flaml/__init__.py +++ b/flaml/__init__.py @@ -2,7 +2,7 @@ from flaml.automl import AutoML, logger_formatter from flaml.tune.searcher import CFO, BlendSearch, FLOW2, BlendSearchTuner, RandomSearch from flaml.onlineml.autovw import AutoVW -from flaml.integrations import oai +from flaml.autogen import oai from flaml.version import __version__ diff --git a/flaml/autogen/code_utils.py b/flaml/autogen/code_utils.py new file mode 100644 index 0000000000..1d33357e30 --- /dev/null +++ b/flaml/autogen/code_utils.py @@ -0,0 +1,148 @@ +import signal +import subprocess +import sys +from typing import List, Dict, Tuple, Optional, Union, Callable +from flaml import oai + + +def timeout_handler(signum, frame): + raise TimeoutError("Timed out!") + + +def execute_code(code: str, max_exec_time: Optional[int] = 3): + signal.signal(signal.SIGALRM, timeout_handler) + code = code.strip() + with open("codetest.py", "w") as fout: + fout.write(code) + try: + signal.alarm(max_exec_time) + result = subprocess.run( + [sys.executable, "codetest.py"], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + signal.alarm(0) + except TimeoutError: + return 0 + return int(result.returncode == 0) + + +def generate_assertions( + definition: str, model: Optional[str] = "gpt-3.5-turbo" +) -> Tuple[str, float]: + """Generate assertions for a function. + + Args: + definition (str): The function definition, including the signature and docstr. + model (str): The model used for generation. + + Returns: + str: The generated assertions. + float: The cost of the generation. + """ + prompt = """Given the signature and docstring, write the exactly same number of assertion(s) for the provided example(s) in the docstring, without assertion messages. + +func signature: +{definition} +assertions:""" + response = oai.Completion.create( + {"definition": definition}, + model=model, + prompt=prompt, + max_tokens=256, + stop="\n\n", + ) + cost = oai.Completion.cost(model, response) + assertions = oai.Completion.extract_text(response)[0] + return assertions, cost + + +def success_metrics( + responses: List[str], + definition: str, + test: Optional[str] = None, + entry_point: Optional[str] = None, + assertions: Optional[Union[str, Callable[[str], Tuple[str, float]]]] = None, +) -> Dict: + """Check if the task is successful. + + Args: + responses (list): The list of responses. + definition (str): The input definition. + test (Optional, str): The test code. + entry_point (Optional, str): The name of the function. + assertions (Optional, str or Callable): The assertion code which serves as a filter of the responses, or an assertion generator. 
+ When provided, only the responses that pass the assertions will be considered for the actual test (if provided). + + Returns: + dict: The success metrics. + """ + n = len(responses) + if assertions is None: + # no assertion filter + success_list = [] + for i in range(n): + response = responses[i] + code = f"{definition}{response}\n{test}\ncheck({entry_point})" + success = execute_code(code) + success_list.append(success) + return { + "expected_success": 1 - pow(1 - sum(success_list) / n, n), + "success": any(s for s in success_list), + } + if callable(assertions) and n > 1: + # assertion generator + assertions, gen_cost = assertions(definition) + else: + gen_cost = 0 + if n > 1 or test is None: + for i in range(n): + response = responses[i] + code = ( + f"{response}\n{assertions}" + if response.startswith("def") + else f"{definition}{response}\n{assertions}" + ) + succeed_assertions = execute_code(code) + if succeed_assertions: + break + else: + # just test, no need to check assertions + succeed_assertions = False + i, response = 0, responses[0] + if test is None: + # no test code + return {"succeed_assertions": succeed_assertions} + code_test = ( + f"{response}\n{test}\ncheck({entry_point})" + if response.startswith("def") + else f"{definition}{response}\n{test}\ncheck({entry_point})" + ) + success = execute_code(code_test) + return { + "index_selected": i, + "succeed_assertions": succeed_assertions, + "success": success, + "gen_cost": gen_cost, + } + + +def implement(definition: str, configs: List[Dict]) -> Tuple[str, float]: + """Implement a function. + + Args: + definition (str): The function definition, including the signature and docstr. + configs (list): The list of configurations for completion. + + Returns: + str: The implementation. + float: The cost of the implementation. 
+ """ + assertions, cost = generate_assertions(definition) + for i, config in enumerate(configs): + response = oai.Completion.create({"definition": definition}, **config) + cost += oai.Completion.cost(config["model"], response) + responses = oai.Completion.extract_text(response) + metrics = success_metrics(responses, definition, assertions=assertions) + if metrics["succeed_assertions"] or i == len(configs) - 1: + return responses[metrics["index_selected"]], cost diff --git a/flaml/autogen/math_utils.py b/flaml/autogen/math_utils.py index 94388b4b26..76ba52d979 100644 --- a/flaml/autogen/math_utils.py +++ b/flaml/autogen/math_utils.py @@ -299,7 +299,7 @@ def success_metrics(responses, solution, **args): # voting answers = voting_counts(responses) # find the answer with highest votes in answers - answer = max(answers.items(), key=lambda x: x[1], default=(0, 0))[0] + answer, votes = max(answers.items(), key=lambda x: x[1], default=(0, 0)) # check if the answer is correct success_vote = is_equiv_chain_of_thought(responses[answer], solution) return { @@ -307,4 +307,5 @@ def success_metrics(responses, solution, **args): "success": any(s for s in success_list), "success_vote": success_vote, "voted_answer": responses[answer], + "votes": votes, } diff --git a/flaml/autogen/oai/__init__.py b/flaml/autogen/oai/__init__.py new file mode 100644 index 0000000000..c845911b42 --- /dev/null +++ b/flaml/autogen/oai/__init__.py @@ -0,0 +1,3 @@ +from flaml.autogen.oai.completion import Completion, ChatCompletion + +__all__ = ["Completion", "ChatCompletion"] diff --git a/flaml/integrations/__init__.py b/flaml/integrations/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/flaml/integrations/oai/__init__.py b/flaml/integrations/oai/__init__.py deleted file mode 100644 index 12320692d1..0000000000 --- a/flaml/integrations/oai/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from flaml.integrations.oai.completion import Completion, ChatCompletion - -__all__ = ["Completion", "ChatCompletion"] diff --git a/flaml/integrations/oai/completion.py b/flaml/integrations/oai/completion.py index 41c2eff363..f9d987e01d 100644 --- a/flaml/integrations/oai/completion.py +++ b/flaml/integrations/oai/completion.py @@ -2,6 +2,7 @@ import logging import numpy as np import time +from typing import List from flaml import tune, BlendSearch try: @@ -214,6 +215,13 @@ def _update_invalid_n(cls, prune, region_key, max_tokens, num_completions): num_completions, invalid_n.get(max_tokens, np.inf) ) + @classmethod + def _pop_subspace(cls, config): + if "subspace" in config: + config = config.copy() + config.update(config.pop("subspace")) + return config + @classmethod def eval(cls, config: dict, prune=True, eval_only=False): """Evaluate the given config as the hyperparameter setting for the openai api call. 
@@ -229,6 +237,7 @@ def eval(cls, config: dict, prune=True, eval_only=False): """ cost = 0 data = cls.data + config = cls._pop_subspace(config) model = config["model"] data_length = len(data) price = cls.price1K.get(model) @@ -586,22 +595,38 @@ def eval_func(responses, **data): cls.data = data cls.avg_input_tokens = None - search_alg = BlendSearch( - cost_attr="cost", - cost_budget=optimization_budget, - metric=metric, - mode=mode, - space=space, - ) space_model = space["model"] if not isinstance(space_model, str) and len(space_model) > 1: + # make a hierarchical search space + subspace = {} + if "max_tokens" in space: + subspace["max_tokens"] = space.pop("max_tokens") + if "temperature_or_top_p" in space: + subspace["temperature_or_top_p"] = space.pop("temperature_or_top_p") + if "best_of" in space: + subspace["best_of"] = space.pop("best_of") + if "n" in space: + subspace["n"] = space.pop("n") + choices = [] + for model in space["model"]: + choices.append({"model": model, **subspace}) + space["subspace"] = tune.choice(choices) + space.pop("model") # start all the models with the same hp config + search_alg = BlendSearch( + cost_attr="cost", + cost_budget=optimization_budget, + metric=metric, + mode=mode, + space=space, + ) config0 = search_alg.suggest("t0") points_to_evaluate = [config0] for model in space_model: - if model != config0["model"]: + if model != config0["subspace"]["model"]: point = config0.copy() - point["model"] = model + point["subspace"] = point["subspace"].copy() + point["subspace"]["model"] = model points_to_evaluate.append(point) search_alg = BlendSearch( cost_attr="cost", @@ -611,6 +636,14 @@ def eval_func(responses, **data): space=space, points_to_evaluate=points_to_evaluate, ) + else: + search_alg = BlendSearch( + cost_attr="cost", + cost_budget=optimization_budget, + metric=metric, + mode=mode, + space=space, + ) logger.setLevel(logging_level) with diskcache.Cache(cls.cache_path) as cls._cache: analysis = tune.run( @@ -621,7 +654,7 @@ def eval_func(responses, **data): verbose=3, ) config = analysis.best_config - params = config.copy() + params = cls._pop_subspace(config) if cls._prompts: params["prompt"] = cls._prompts[config["prompt"]] else: @@ -677,10 +710,50 @@ def create(cls, context, use_cache=True, **config): prompt.format(**context) if isinstance(prompt, str) else prompt(context) ) if use_cache: + seed = cls.seed + if "seed" in params: + cls.set_cache(params.pop("seed")) with diskcache.Cache(cls.cache_path) as cls._cache: + cls.set_cache(seed) return cls._get_response(params) return cls.openai_completion_class.create(**params) + @classmethod + def cost(cls, model: str, response: dict): + """Compute the cost of a completion. + + Args: + model (str): The model name. + response (dict): The response from OpenAI API. + + Returns: + The cost in USD. + """ + if model not in cls.price1K: + raise ValueError(f"Unknown model: {model}") + usage = response["usage"] + n_input_tokens = usage["prompt_tokens"] + n_output_tokens = usage.get("completion_tokens", 0) + price1K = cls.price1K[model] + if isinstance(price1K, tuple): + return (price1K[0] * n_input_tokens + price1K[1] * n_output_tokens) / 1000 + return price1K * (n_input_tokens + n_output_tokens) / 1000 + + @classmethod + def extract_text(cls, response: dict) -> List[str]: + """Extract the text from a completion response. + + Args: + response (dict): The response from OpenAI API. + + Returns: + The text. 
+ """ + choices = response["choices"] + if "text" in choices[0]: + return [choice["text"] for choice in choices] + return [choice["message"]["content"] for choice in choices] + class ChatCompletion(Completion): """A class for OpenAI API ChatCompletion.""" From a2242797affd5014e85d89c6b146c00a02925d8d Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Mon, 3 Apr 2023 01:54:24 +0000 Subject: [PATCH 04/27] remove check function from code response --- flaml/autogen/code_utils.py | 19 +- notebook/autogen_openai.ipynb | 2 +- test/openai/test_completion.py | 580 +++++--------------- test/openai/test_notebook.py | 12 +- website/docs/Examples/Integrate - OpenAI.md | 10 +- 5 files changed, 173 insertions(+), 450 deletions(-) diff --git a/flaml/autogen/code_utils.py b/flaml/autogen/code_utils.py index 1d33357e30..6392e5acd8 100644 --- a/flaml/autogen/code_utils.py +++ b/flaml/autogen/code_utils.py @@ -57,6 +57,15 @@ def generate_assertions( return assertions, cost +def _remove_check(response): + """Remove the check function from the response.""" + # find the position of the check function + pos = response.find("def check") + if pos == -1: + return response + return response[:pos] + + def success_metrics( responses: List[str], definition: str, @@ -82,7 +91,7 @@ def success_metrics( # no assertion filter success_list = [] for i in range(n): - response = responses[i] + response = _remove_check(responses[i]) code = f"{definition}{response}\n{test}\ncheck({entry_point})" success = execute_code(code) success_list.append(success) @@ -97,7 +106,7 @@ def success_metrics( gen_cost = 0 if n > 1 or test is None: for i in range(n): - response = responses[i] + response = responses[i] = _remove_check(responses[i]) code = ( f"{response}\n{assertions}" if response.startswith("def") @@ -112,7 +121,11 @@ def success_metrics( i, response = 0, responses[0] if test is None: # no test code - return {"succeed_assertions": succeed_assertions} + return { + "index_selected": i, + "succeed_assertions": succeed_assertions, + "gen_cost": gen_cost, + } code_test = ( f"{response}\n{test}\ncheck({entry_point})" if response.startswith("def") diff --git a/notebook/autogen_openai.ipynb b/notebook/autogen_openai.ipynb index 457d0df3be..cdbf803dc2 100644 --- a/notebook/autogen_openai.ipynb +++ b/notebook/autogen_openai.ipynb @@ -296,7 +296,7 @@ "from functools import partial\n", "from flaml.autogen.code_utils import success_metrics, generate_assertions\n", "\n", - "success_metrics_with_generated_assertions = partial(success_metrics,assertions=generate_assertions)" + "success_metrics_with_generated_assertions = partial(success_metrics, assertions=generate_assertions)" ] }, { diff --git a/test/openai/test_completion.py b/test/openai/test_completion.py index d20f0f63c9..afc738c073 100644 --- a/test/openai/test_completion.py +++ b/test/openai/test_completion.py @@ -1,10 +1,11 @@ import datasets -import signal -import subprocess import sys import numpy as np import pytest +from functools import partial from flaml import oai +from flaml.autogen.code_utils import success_metrics, generate_assertions, implement +from flaml.autogen.math_utils import success_metrics as math_success_metrics @pytest.mark.skipif( @@ -12,58 +13,16 @@ reason="do not run on windows", ) def test_humaneval(num_samples=1): - def timeout_handler(signum, frame): - raise TimeoutError("Timed out!") - - signal.signal(signal.SIGALRM, timeout_handler) - max_exec_time = 3 # seconds - - def execute_code(code): - code = code.strip() - with open("codetest.py", "w") as fout: - 
fout.write(code) - try: - signal.alarm(max_exec_time) - result = subprocess.run( - [sys.executable, "codetest.py"], - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - ) - signal.alarm(0) - except TimeoutError: - return 0 - return int(result.returncode == 0) - - def success_metrics(responses, prompt, test, entry_point): - """Check if the response is correct. - - Args: - responses (list): The list of responses. - prompt (str): The input prompt. - test (str): The test code. - entry_point (str): The name of the function. - - Returns: - dict: The success metrics. - """ - success_list = [] - n = len(responses) - for i in range(n): - response = responses[i] - code = f"{prompt}{response}\n{test}\ncheck({entry_point})" - succeed = execute_code(code) - success_list.append(succeed) - return { - "expected_success": 1 - pow(1 - np.mean(success_list), n), - "success": any(s for s in success_list), - } + success_metrics_with_generated_assertions = partial( + success_metrics, assertions=generate_assertions + ) seed = 41 data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed) n_tune_data = 20 tune_data = [ { - "prompt": data[x]["prompt"], + "definition": data[x]["prompt"], "test": data[x]["test"], "entry_point": data[x]["entry_point"], } @@ -71,7 +30,7 @@ def success_metrics(responses, prompt, test, entry_point): ] test_data = [ { - "prompt": data[x]["prompt"], + "definition": data[x]["prompt"], "test": data[x]["test"], "entry_point": data[x]["entry_point"], } @@ -79,335 +38,77 @@ def success_metrics(responses, prompt, test, entry_point): ] oai.Completion.set_cache(seed) try: - # a minimal tuning example - config, _ = oai.Completion.tune( - data=tune_data, - metric="success", - mode="max", - eval_func=success_metrics, - n=1, - ) - responses = oai.Completion.create(context=test_data[0], **config) - # a minimal tuning example for tuning chat completion models using the Completion class - config, _ = oai.Completion.tune( - data=tune_data, - metric="success", - mode="max", - eval_func=success_metrics, - n=1, - model="gpt-3.5-turbo", - ) - responses = oai.Completion.create(context=test_data[0], **config) - # a minimal tuning example for tuning chat completion models using the Completion class - config, _ = oai.ChatCompletion.tune( - data=tune_data, - metric="success", - mode="max", - eval_func=success_metrics, - n=1, - messages=[{"role": "user", "content": "{prompt}"}], - ) - responses = oai.ChatCompletion.create(context=test_data[0], **config) - print(responses) - # a more comprehensive tuning example - config, analysis = oai.Completion.tune( - data=tune_data, - metric="expected_success", - mode="max", - eval_func=success_metrics, - log_file_name="logs/humaneval.log", - inference_budget=0.002, - optimization_budget=2, - num_samples=num_samples, - prompt=[ - "{prompt}", - "# Python 3{prompt}", - "Complete the following Python function:{prompt}", - "Complete the following Python function while including necessary import statements inside the function:{prompt}", - ], - stop=["\nclass", "\ndef", "\nif", "\nprint"], - ) - print(config) - print(analysis.best_result) - print(test_data[0]) - responses = oai.Completion.create(context=test_data[0], **config) - print(responses) - oai.Completion.data = test_data[:num_samples] - result = oai.Completion._eval(analysis.best_config, prune=False, eval_only=True) - print("result without pruning", result) - result = oai.Completion.test(test_data[:num_samples], config=config) - print(result) + import openai + import diskcache except ImportError as exc: 
print(exc) + return + # a minimal tuning example + config, _ = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=success_metrics, + n=1, + prompt="{definition}", + ) + responses = oai.Completion.create(context=test_data[0], **config) + # a minimal tuning example for tuning chat completion models using the Completion class + config, _ = oai.Completion.tune( + data=tune_data, + metric="succeed_assertions", + mode="max", + eval_func=success_metrics_with_generated_assertions, + n=1, + model="gpt-3.5-turbo", + prompt="{definition}", + ) + responses = oai.Completion.create(context=test_data[0], **config) + # a minimal tuning example for tuning chat completion models using the Completion class + config, _ = oai.ChatCompletion.tune( + data=tune_data, + metric="expected_success", + mode="max", + eval_func=success_metrics, + n=1, + messages=[{"role": "user", "content": "{definition}"}], + ) + responses = oai.ChatCompletion.create(context=test_data[0], **config) + print(responses) + code, cost = implement(tune_data[1], [config]) + print(code) + print(cost) + print(success_metrics([code], **tune_data[1])) + # a more comprehensive tuning example + config, analysis = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=success_metrics_with_generated_assertions, + log_file_name="logs/humaneval.log", + inference_budget=0.002, + optimization_budget=2, + num_samples=num_samples, + prompt=[ + "{definition}", + "# Python 3{definition}", + "Complete the following Python function:{definition}", + ], + stop=[["\nclass", "\ndef", "\nif", "\nprint"], None], # the stop sequences + ) + print(config) + print(analysis.best_result) + print(test_data[0]) + responses = oai.Completion.create(context=test_data[0], **config) + print(responses) + oai.Completion.data = test_data[:num_samples] + result = oai.Completion._eval(analysis.best_config, prune=False, eval_only=True) + print("result without pruning", result) + result = oai.Completion.test(test_data[:num_samples], config=config) + print(result) def test_math(num_samples=-1): - from typing import Optional - - def remove_boxed(string: str) -> Optional[str]: - """Source: https://github.com/hendrycks/math - Extract the text within a \\boxed{...} environment. - Example: - >>> remove_boxed(\\boxed{\\frac{2}{3}}) - \\frac{2}{3} - """ - left = "\\boxed{" - try: - assert string[: len(left)] == left - assert string[-1] == "}" - return string[len(left) : -1] - except Exception: - return None - - def last_boxed_only_string(string: str) -> Optional[str]: - """Source: https://github.com/hendrycks/math - Extract the last \\boxed{...} or \\fbox{...} element from a string. - """ - idx = string.rfind("\\boxed") - if idx < 0: - idx = string.rfind("\\fbox") - if idx < 0: - return None - - i = idx - right_brace_idx = None - num_left_braces_open = 0 - while i < len(string): - if string[i] == "{": - num_left_braces_open += 1 - if string[i] == "}": - num_left_braces_open -= 1 - if num_left_braces_open == 0: - right_brace_idx = i - break - i += 1 - - if right_brace_idx is None: - retval = None - else: - retval = string[idx : right_brace_idx + 1] - - return retval - - def _fix_fracs(string: str) -> str: - """Source: https://github.com/hendrycks/math - Reformat fractions. 
- Examples: - >>> _fix_fracs("\\frac1b") - \frac{1}{b} - >>> _fix_fracs("\\frac12") - \frac{1}{2} - >>> _fix_fracs("\\frac1{72}") - \frac{1}{72} - """ - substrs = string.split("\\frac") - new_str = substrs[0] - if len(substrs) > 1: - substrs = substrs[1:] - for substr in substrs: - new_str += "\\frac" - if substr[0] == "{": - new_str += substr - else: - try: - assert len(substr) >= 2 - except Exception: - return string - a = substr[0] - b = substr[1] - if b != "{": - if len(substr) > 2: - post_substr = substr[2:] - new_str += "{" + a + "}{" + b + "}" + post_substr - else: - new_str += "{" + a + "}{" + b + "}" - else: - if len(substr) > 2: - post_substr = substr[2:] - new_str += "{" + a + "}" + b + post_substr - else: - new_str += "{" + a + "}" + b - string = new_str - return string - - def _fix_a_slash_b(string: str) -> str: - """Source: https://github.com/hendrycks/math - Reformat fractions formatted as a/b to \\frac{a}{b}. - Example: - >>> _fix_a_slash_b("2/3") - \frac{2}{3} - """ - if len(string.split("/")) != 2: - return string - a_str = string.split("/")[0] - b_str = string.split("/")[1] - try: - a = int(a_str) - b = int(b_str) - assert string == "{}/{}".format(a, b) - new_string = "\\frac{" + str(a) + "}{" + str(b) + "}" - return new_string - except Exception: - return string - - def _remove_right_units(string: str) -> str: - """Source: https://github.com/hendrycks/math""" - if "\\text{ " in string: - splits = string.split("\\text{ ") - assert len(splits) == 2 - return splits[0] - else: - return string - - def _fix_sqrt(string: str) -> str: - """Source: https://github.com/hendrycks/math""" - if "\\sqrt" not in string: - return string - splits = string.split("\\sqrt") - new_string = splits[0] - for split in splits[1:]: - if split[0] != "{": - a = split[0] - new_substr = "\\sqrt{" + a + "}" + split[1:] - else: - new_substr = "\\sqrt" + split - new_string += new_substr - return new_string - - def _strip_string(string: str) -> str: - """Source: https://github.com/hendrycks/math - Apply the reformatting helper functions above. - """ - # linebreaks - string = string.replace("\n", "") - # print(string) - - # remove inverse spaces - string = string.replace("\\!", "") - # print(string) - - # replace \\ with \ - string = string.replace("\\\\", "\\") - # print(string) - - # replace tfrac and dfrac with frac - string = string.replace("tfrac", "frac") - string = string.replace("dfrac", "frac") - # print(string) - - # remove \left and \right - string = string.replace("\\left", "") - string = string.replace("\\right", "") - # print(string) - - # Remove circ (degrees) - string = string.replace("^{\\circ}", "") - string = string.replace("^\\circ", "") - - # remove dollar signs - string = string.replace("\\$", "") - - # remove units (on the right) - string = _remove_right_units(string) - - # remove percentage - string = string.replace("\\%", "") - string = string.replace(r"\%", "") - - # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string - string = string.replace(" .", " 0.") - string = string.replace("{.", "{0.") - # if empty, return empty string - if len(string) == 0: - return string - if string[0] == ".": - string = "0" + string - - # to consider: get rid of e.g. 
"k = " or "q = " at beginning - if len(string.split("=")) == 2: - if len(string.split("=")[0]) <= 2: - string = string.split("=")[1] - - # fix sqrt3 --> sqrt{3} - string = _fix_sqrt(string) - - # remove spaces - string = string.replace(" ", "") - - # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. - # Even works with \frac1{72} (but not \frac{72}1). - # Also does a/b --> \\frac{a}{b} - string = _fix_fracs(string) - - # manually change 0.5 --> \frac{1}{2} - if string == "0.5": - string = "\\frac{1}{2}" - - # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y - string = _fix_a_slash_b(string) - - return string - - def get_answer(solution: Optional[str]) -> Optional[str]: - if solution is None: - return None - last_boxed = last_boxed_only_string(solution) - if last_boxed is None: - return None - answer = remove_boxed(last_boxed) - if answer is None: - return None - return answer - - def is_equiv(str1: Optional[str], str2: Optional[str]) -> float: - """Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in - - units - - fractions - - square roots - - superfluous LaTeX. - Source: https://github.com/hendrycks/math - """ - if str1 is None and str2 is None: - print("WARNING: Both None") - return 1.0 - if str1 is None or str2 is None: - return 0.0 - - try: - ss1 = _strip_string(str1) - ss2 = _strip_string(str2) - return float(ss1 == ss2) - except Exception: - return float(str1 == str2) - - def is_equiv_chain_of_thought(str1: str, str2: str) -> float: - """Strips the solution first before calling `is_equiv`.""" - ans1 = get_answer(str1) - ans2 = get_answer(str2) - - return is_equiv(ans1, ans2) - - def success_metrics(responses, solution, **args): - """Check if each response is correct. - - Args: - responses (list): The list of responses. - solution (str): The canonical solution. - - Returns: - dict: The success metrics. - """ - success_list = [] - n = len(responses) - for i in range(n): - response = responses[i] - succeed = is_equiv_chain_of_thought(response, solution) - success_list.append(succeed) - return { - "expected_success": 1 - pow(1 - sum(success_list) / n, n), - "success": any(s for s in success_list), - } - seed = 41 data = datasets.load_dataset("competition_math") train_data = data["train"].shuffle(seed=seed) @@ -436,78 +137,87 @@ def success_metrics(responses, solution, **args): print(len(tune_data), len(test_data)) # prompt template prompts = [ - lambda data: "Given a mathematics problem, determine the answer. Simplify your answer as much as possible.\n###\nProblem: What is the value of $\\sqrt{3! \\cdot 3!}$ expressed as a positive integer?\nAnswer: $\\sqrt{3!\\cdot3!}$ is equal to $\\sqrt{(3!)^2}=3!=3\\cdot2\\cdot1=\\boxed{6}$.\n###\nProblem: %s\nAnswer:" - + data["problem"] + lambda data: "%s Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{}." 
+ % data["problem"] ] try: - oai.ChatCompletion.set_cache(seed) - vanilla_config = { - "model": "gpt-3.5-turbo", - "temperature": 1, - "max_tokens": 2048, - "n": 1, - "prompt": prompts[0], - "stop": "###", - } - test_data_sample = test_data[0:3] - result = oai.ChatCompletion.test( - test_data_sample, vanilla_config, success_metrics - ) - test_data_sample = test_data[3:6] - result = oai.ChatCompletion.test( - test_data_sample, - vanilla_config, - success_metrics, - use_cache=False, - agg_method="median", - ) - - def my_median(results): - return np.median(results) + import openai + import diskcache + except ImportError as exc: + print(exc) + return + + oai.ChatCompletion.set_cache(seed) + vanilla_config = { + "model": "gpt-3.5-turbo", + "temperature": 1, + "max_tokens": 2048, + "n": 1, + "prompt": prompts[0], + "stop": "###", + } + test_data_sample = test_data[0:3] + result = oai.ChatCompletion.test( + test_data_sample, vanilla_config, math_success_metrics + ) + test_data_sample = test_data[3:6] + result = oai.ChatCompletion.test( + test_data_sample, + vanilla_config, + math_success_metrics, + use_cache=False, + agg_method="median", + ) - def my_average(results): - return np.mean(results) + def my_median(results): + return np.median(results) - result = oai.ChatCompletion.test( - test_data_sample, - vanilla_config, - success_metrics, - use_cache=False, - agg_method=my_median, - ) - result = oai.ChatCompletion.test( - test_data_sample, - vanilla_config, - success_metrics, - use_cache=False, - agg_method={"expected_success": my_median, "success": my_average}, - ) + def my_average(results): + return np.mean(results) - print(result) + result = oai.ChatCompletion.test( + test_data_sample, + vanilla_config, + math_success_metrics, + use_cache=False, + agg_method=my_median, + ) + result = oai.ChatCompletion.test( + test_data_sample, + vanilla_config, + math_success_metrics, + use_cache=False, + agg_method={ + "expected_success": my_median, + "success": my_average, + "success_vote": my_average, + "votes": np.mean, + }, + ) - config, _ = oai.ChatCompletion.tune( - data=tune_data, # the data for tuning - metric="expected_success", # the metric to optimize - mode="max", # the optimization mode - eval_func=success_metrics, # the evaluation function to return the success metrics - # log_file_name="logs/math.log", # the log file name - inference_budget=0.002, # the inference budget (dollar) - optimization_budget=0.01, # the optimization budget (dollar) - num_samples=num_samples, - prompt=prompts, # the prompt templates to choose from - stop="###", # the stop sequence - ) - print("tuned config", config) - result = oai.ChatCompletion.test(test_data_sample, config) - print("result from tuned config:", result) - except (ImportError, NameError) as exc: - print(exc) + print(result) + + config, _ = oai.ChatCompletion.tune( + data=tune_data, # the data for tuning + metric="expected_success", # the metric to optimize + mode="max", # the optimization mode + eval_func=math_success_metrics, # the evaluation function to return the success metrics + # log_file_name="logs/math.log", # the log file name + inference_budget=0.002, # the inference budget (dollar) + optimization_budget=0.01, # the optimization budget (dollar) + num_samples=num_samples, + prompt=prompts, # the prompt templates to choose from + stop="###", # the stop sequence + ) + print("tuned config", config) + result = oai.ChatCompletion.test(test_data_sample, config) + print("result from tuned config:", result) if __name__ == "__main__": import openai 
openai.api_key_path = "test/openai/key.txt" - test_humaneval(-1) - test_math(-1) + test_humaneval(1) + test_math(1) diff --git a/test/openai/test_notebook.py b/test/openai/test_notebook.py index fa68001675..a759543838 100644 --- a/test/openai/test_notebook.py +++ b/test/openai/test_notebook.py @@ -45,18 +45,18 @@ def run_notebook(input_nb, output_nb="executed_openai_notebook.ipynb", save=Fals skip, reason="do not run openai test if openai is not installed", ) -def test_integrate_openai(save=False): - run_notebook("integrate_openai.ipynb", save=save) +def test_autogen_openai(save=False): + run_notebook("autogen_openai.ipynb", save=save) @pytest.mark.skipif( skip, reason="do not run openai test if openai is not installed", ) -def test_integrate_chatgpt(save=False): - run_notebook("integrate_chatgpt.ipynb", save=save) +def test_autogen_chatgpt(save=False): + run_notebook("autogen_chatgpt.ipynb", save=save) if __name__ == "__main__": - test_integrate_chatgpt(save=True) - test_integrate_openai(save=True) + test_autogen_chatgpt(save=True) + test_autogen_openai(save=True) diff --git a/website/docs/Examples/Integrate - OpenAI.md b/website/docs/Examples/Integrate - OpenAI.md index eff22b0c2a..23c30c16c5 100644 --- a/website/docs/Examples/Integrate - OpenAI.md +++ b/website/docs/Examples/Integrate - OpenAI.md @@ -3,7 +3,7 @@ In this example, we will tune several hyperparameters for the OpenAI's completio ### Prerequisites -Install the [openai] option. The OpenAI integration is in preview. ChaptGPT support is available since version 1.2.0. +Install the [openai] option. The OpenAI integration is in preview. ```bash pip install "flaml[openai]==1.2.0" ``` @@ -60,7 +60,7 @@ Before starting tuning, you need to define the metric for the optimization. For from functools import partial from flaml.autogen.code_utils import success_metrics, generate_assertions -success_metrics_with_generated_assertions = partial(success_metrics,assertions=generate_assertions) +success_metrics_with_generated_assertions = partial(success_metrics, assertions=generate_assertions) ``` This function will first generate assertion statements for each problem. Then, it uses the assertions to select the generated responses. 
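For intuition, here is a minimal sketch of how this selection works on a toy problem, continuing from the snippet above. The `definition` and `responses` values are illustrative placeholders, not taken from the dataset, and running it requires an OpenAI key because the assertions are generated by a model:

```python
# Illustrative toy problem: a signature whose docstring carries one
# example, which generate_assertions turns into assert statements.
definition = (
    'def add(a, b):\n'
    '    """Add two numbers.\n'
    '    >>> add(1, 2)\n'
    '    3\n'
    '    """\n'
)
# Two hypothetical candidate completions; only the first is correct.
responses = ["    return a + b", "    return a - b"]

metrics = success_metrics_with_generated_assertions(responses, definition)
# With no held-out `test` given, the returned metrics report which
# candidate passed the generated assertions and the generation cost.
print(metrics["index_selected"], metrics["succeed_assertions"], metrics["gen_cost"])
```

The same function can then be passed as `eval_func` to `oai.Completion.tune`, so that tuning optimizes a metric such as `succeed_assertions` without needing the hidden unit tests.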
@@ -88,9 +88,9 @@ config, analysis = oai.Completion.tune( # -1 means decided by the optimization budget only num_samples=-1, prompt=[ - "{prompt}", - "# Python 3{prompt}", - "Complete the following Python function:{prompt}", + "{definition}", + "# Python 3{definition}", + "Complete the following Python function:{definition}", ], # the prompt templates to choose from stop=[["\nclass", "\ndef", "\nif", "\nprint"], None], # the stop sequences ) From c277d366b9a24d0f9130496f03a6d8defdd0f122 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Mon, 3 Apr 2023 03:37:43 +0000 Subject: [PATCH 05/27] comment out test --- notebook/autogen_openai.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebook/autogen_openai.ipynb b/notebook/autogen_openai.ipynb index cdbf803dc2..8e5e0d4f38 100644 --- a/notebook/autogen_openai.ipynb +++ b/notebook/autogen_openai.ipynb @@ -760,8 +760,8 @@ } ], "source": [ - "result = oai.Completion.test(test_data, config)\n", - "print(\"performance on test data with the tuned config:\", result)" + "# result = oai.Completion.test(test_data, config)\n", + "# print(\"performance on test data with the tuned config:\", result)" ] }, { From 1878981d3b846ef2fd47a66c20cd5386ce2f2d1e Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Mon, 3 Apr 2023 16:16:08 +0000 Subject: [PATCH 06/27] GPT-4 --- README.md | 4 ++-- website/docs/Getting-Started.md | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b465b2967f..54856a31a8 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,9 @@ ## What is FLAML FLAML is a lightweight Python library that finds accurate machine learning models automatically, efficiently and economically. It frees users from selecting -models and hyperparameters for each model. It can also be used to tune generic hyperparameters for large language models (LLM), MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on. +models and hyperparameters for each model. It can also be used to tune generic hyperparameters for large foundational models, MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on. -1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large language models such as the OpenAI GPT-3 models. +1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large foundational models such as ChatGPT and GPT-4. 1. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). 1. It supports fast automatic tuning, capable of handling complex constraints/guidance/early stopping. 
FLAML is powered by a new, [cost-effective hyperparameter optimization](https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function/#hyperparameter-optimization-algorithm) diff --git a/website/docs/Getting-Started.md b/website/docs/Getting-Started.md index c2d498ae9f..afcee636d2 100644 --- a/website/docs/Getting-Started.md +++ b/website/docs/Getting-Started.md @@ -7,10 +7,8 @@ learning models automatically, efficiently and economically. It frees users from ### Main Features -1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large language models such as the OpenAI GPT-3 models. - +1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large foundational models such as ChatGPT and GPT-4. 2. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). Users can customize only when and what they need to, and leave the rest to the library. - 3. It supports fast and economical automatic tuning, capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping. FLAML is powered by a new, [cost-effective hyperparameter optimization](Use-Cases/Tune-User-Defined-Function#hyperparameter-optimization-algorithm) and model selection method invented by Microsoft Research, and many followup [research studies](Research). 
From 0b56c4d86a7a45f671ad4fc5ad9eaa3ffde55e66 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Mon, 3 Apr 2023 16:30:40 +0000 Subject: [PATCH 07/27] increase request timeout --- flaml/autogen/oai/completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index d2586b04dd..fb2ee4f0e9 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -100,9 +100,9 @@ class Completion: # retry after this many seconds retry_time = 10 # fail a request after hitting RateLimitError for this many seconds - retry_timeout = 60 + retry_timeout = 120 # time out for request to openai server - request_timeout = 30 + request_timeout = 60 openai_completion_class = not ERROR and openai.Completion _total_cost = 0 From 99609a8a6e3cd1500e3e56306399ed109ef6cd5b Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Mon, 3 Apr 2023 19:23:34 +0000 Subject: [PATCH 08/27] name --- notebook/integrate_openai.ipynb | 1232 ----------------- ...egrate - OpenAI.md => AutoGen - OpenAI.md} | 4 +- 2 files changed, 3 insertions(+), 1233 deletions(-) delete mode 100644 notebook/integrate_openai.ipynb rename website/docs/Examples/{Integrate - OpenAI.md => AutoGen - OpenAI.md} (98%) diff --git a/notebook/integrate_openai.ipynb b/notebook/integrate_openai.ipynb deleted file mode 100644 index 2cbd924a41..0000000000 --- a/notebook/integrate_openai.ipynb +++ /dev/null @@ -1,1232 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved. \n", - "\n", - "Licensed under the MIT License.\n", - "\n", - "# Use FLAML to Tune OpenAI Models\n", - "\n", - "FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of LLMs.\n", - "\n", - "In this notebook, we tune OpenAI models for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n", - "\n", - "## Requirements\n", - "\n", - "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n", - "```bash\n", - "pip install flaml[openai]==1.1.3\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.910966Z", - "iopub.status.busy": "2023-02-24T23:25:36.910473Z", - "iopub.status.idle": "2023-02-24T23:25:36.914554Z", - "shell.execute_reply": "2023-02-24T23:25:36.914030Z" - } - }, - "outputs": [], - "source": [ - "# %pip install flaml[openai]==1.1.3 datasets" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set your OpenAI key:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.917301Z", - "iopub.status.busy": "2023-02-24T23:25:36.917011Z", - "iopub.status.idle": "2023-02-24T23:25:36.923156Z", - "shell.execute_reply": "2023-02-24T23:25:36.922619Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you use Azure OpenAI, uncomment the following:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.925804Z", - "iopub.status.busy": "2023-02-24T23:25:36.925423Z", - "iopub.status.idle": "2023-02-24T23:25:36.928191Z", - "shell.execute_reply": "2023-02-24T23:25:36.927673Z" - } - }, - "outputs": [], - "source": [ - "# openai.api_type = \"azure\"\n", - "# openai.api_base = \"https://.openai.azure.com/\"\n", - "# openai.api_version = \"2023-03-15-preview\" # change if necessary" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset\n", - "\n", - "First, we load the humaneval dataset. The dataset contains 164 examples. We use the first 20 for tuning the generation hyperparameters and the remaining for evaluation. In each example, the \"prompt\" is the prompt string for eliciting the code generation, \"test\" is the Python code for unit test for the example, and \"entry_point\" is the function name to be tested." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.931255Z", - "iopub.status.busy": "2023-02-24T23:25:36.930838Z", - "iopub.status.idle": "2023-02-24T23:25:39.148799Z", - "shell.execute_reply": "2023-02-24T23:25:39.148113Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset openai_humaneval (/home/vscode/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d025d7cf0bc3438ba290e24d97855d8f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 [0,0,0,0,3,3]\n", - " compare([0,5,0,0,0,4],[4,1,1,0,0,-2]) -> [4,4,1,0,0,6]\n", - " \"\"\"\n", - "\n" - ] - } - ], - "source": [ - "print(tune_data[1][\"prompt\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is one example of the unit test code for verifying the correctness of the generated code:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.158398Z", - "iopub.status.busy": "2023-02-24T23:25:39.157766Z", - "iopub.status.idle": "2023-02-24T23:25:39.161396Z", - "shell.execute_reply": "2023-02-24T23:25:39.160797Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def check(candidate):\n", - "\n", - " # Check some simple cases\n", - " assert candidate([1,2,3,4,5,1],[1,2,3,4,2,-2])==[0,0,0,0,3,3], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([0,0,0,0,0,0],[0,0,0,0,0,0])==[0,0,0,0,0,0], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([1,2,3],[-1,-2,-3])==[2,4,6], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([1,2,3,5],[-1,2,3,4])==[2,0,0,1], \"This prints if this assert fails 1 (good for debugging!)\"\n", - "\n", - " # Check some edge cases that are easy to work out by hand.\n", - " assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n", - "\n", - "\n" - ] - } - ], - "source": [ - "print(tune_data[1][\"test\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Success Metric\n", - "\n", - "Before we start tuning, we need to define the success metric we want to opotimize. For each code generation task, if one of the returned responses can pass the test, we consider the task as successfully solved. Then we can define the mean success rate of a collection of tasks.\n", - "\n", - "### Define a code executor\n", - "\n", - "First, we write a simple code executor. The code executor takes the generated code and the test code as the input, and execute them with a timer." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.164187Z", - "iopub.status.busy": "2023-02-24T23:25:39.163867Z", - "iopub.status.idle": "2023-02-24T23:25:39.169009Z", - "shell.execute_reply": "2023-02-24T23:25:39.168427Z" - } - }, - "outputs": [], - "source": [ - "import signal\n", - "import subprocess\n", - "import sys\n", - "\n", - "def timeout_handler(signum, frame):\n", - " raise TimeoutError(\"Timed out!\")\n", - "\n", - "signal.signal(signal.SIGALRM, timeout_handler)\n", - "max_exec_time = 3 # seconds\n", - "\n", - "def execute_code(code):\n", - " code = code.strip()\n", - " with open(\"codetest.py\", \"w\") as fout:\n", - " fout.write(code)\n", - " try:\n", - " signal.alarm(max_exec_time)\n", - " result = subprocess.run(\n", - " [sys.executable, \"codetest.py\"],\n", - " stdout=subprocess.DEVNULL,\n", - " stderr=subprocess.PIPE,\n", - " )\n", - " signal.alarm(0)\n", - " except TimeoutError:\n", - " return 0\n", - " return int(result.returncode == 0)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function will create a temp file \"codetest.py\" and execute it in a separate process. It allows for 3 seconds to finish that code.\n", - "\n", - "### Define a function to evaluate the success for a given program synthesis task" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.171752Z", - "iopub.status.busy": "2023-02-24T23:25:39.171347Z", - "iopub.status.idle": "2023-02-24T23:25:39.176343Z", - "shell.execute_reply": "2023-02-24T23:25:39.175510Z" - } - }, - "outputs": [], - "source": [ - "def success_metrics(responses, prompt, test, entry_point):\n", - " \"\"\"Check if the task is successful.\n", - "\n", - " Args:\n", - " responses (list): The list of responses.\n", - " prompt (str): The input prompt.\n", - " test (str): The test code.\n", - " entry_point (str): The name of the function.\n", - "\n", - " Returns:\n", - " dict: The success metrics.\n", - " \"\"\"\n", - " success_list = []\n", - " n = len(responses)\n", - " for i in range(n):\n", - " response = responses[i]\n", - " code = f\"{prompt}{response}\\n{test}\\ncheck({entry_point})\"\n", - " succeed = execute_code(code)\n", - " success_list.append(succeed)\n", - " return {\n", - " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", - " \"success\": any(s for s in success_list),\n", - " }\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Use the tuning data to find a good configuration\n", - "\n", - "### Import the oai and tune subpackages from flaml.\n", - "\n", - "FLAML has provided an API for hyperparameter optimization of OpenAI models: `oai.Completion.tune` and to make a request with the tuned config: `oai.Completion.create`. 
First, we import oai from flaml:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.179030Z", - "iopub.status.busy": "2023-02-24T23:25:39.178624Z", - "iopub.status.idle": "2023-02-24T23:25:40.584410Z", - "shell.execute_reply": "2023-02-24T23:25:40.583802Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "from flaml import oai, tune" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:40.587815Z", - "iopub.status.busy": "2023-02-24T23:25:40.587283Z", - "iopub.status.idle": "2023-02-24T23:25:40.590826Z", - "shell.execute_reply": "2023-02-24T23:25:40.590158Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "oai.Completion.set_cache(seed)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. The cache for different seeds are stored separately.\n", - "\n", - "### Perform tuning\n", - "\n", - "The tuning will take a while to finish, depending on the optimization budget. The tuning will be performed under the specified optimization budgets.\n", - "\n", - "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.02 means the target inference budget is 0.02 dollars, which translates to 1000 tokens (input + output combined) if the text Davinci model is used.\n", - "* `optimization_budget` is the total budget allowed to perform the tuning. For example, 5 means 5 dollars are allowed in total, which translates to 250K tokens for the text Davinci model.\n", - "* `num_sumples` is the number of different hyperparameter configurations which is allowed to try. The tuning will stop after either num_samples trials or after optimization_budget dollars spent, whichever happens first. -1 means no hard restriction in the number of trials and the actual number is decided by `optimization_budget`.\n", - "\n", - "Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces etc.. The default search space is:\n", - "\n", - "```python\n", - "default_search_space = {\n", - " \"model\": tune.choice([\n", - " \"text-ada-001\",\n", - " \"text-babbage-001\",\n", - " \"text-davinci-003\",\n", - " \"gpt-3.5-turbo\",\n", - " \"gpt-4\",\n", - " ]),\n", - " \"temperature_or_top_p\": tune.choice(\n", - " [\n", - " {\"temperature\": tune.uniform(0, 1)},\n", - " {\"top_p\": tune.uniform(0, 1)},\n", - " ]\n", - " ),\n", - " \"max_tokens\": tune.lograndint(50, 1000),\n", - " \"n\": tune.randint(1, 100),\n", - " \"prompt\": \"{prompt}\",\n", - "}\n", - "```\n", - "\n", - "The default search space can be overridden by users' input.\n", - "For example, the following code specifies four choices for the prompt and a fixed list of stop sequences. For hyperparameters which don't appear in users' input, the default search space will be used. If you don't have access to gpt-4 or would like to modify the choice of models, you can provide a different search space for model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:40.593603Z", - "iopub.status.busy": "2023-02-24T23:25:40.593269Z", - "iopub.status.idle": "2023-02-24T23:26:38.349191Z", - "shell.execute_reply": "2023-02-24T23:26:38.348392Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[I 2023-03-26 02:53:26,384]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", - "\u001b[32m[I 2023-03-26 02:53:26,387]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 03-26 02:53:26] {811} INFO - trial 1 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:29] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.09264000000000001, 'cost': 0.09264000000000001, 'inference_cost': 0.004632, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 3.5772321224212646}\n", - "[flaml.tune.tune: 03-26 02:53:29] {811} INFO - trial 2 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:30] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09429879999999999, 'cost': 0.0016588, 'inference_cost': 7.264e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5873167514801025}\n", - "[flaml.tune.tune: 03-26 02:53:30] {811} INFO - trial 3 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09782479999999999, 'cost': 0.003526, 'inference_cost': 0.00016342499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6068365573883057}\n", - "[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.2, 'success': 0.2, 'total_cost': 0.10643079999999999, 'cost': 0.008606, 'inference_cost': 
0.0004394, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5878369808197021}\n", - "[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 5 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:32] {215} INFO - result: {'expected_success': 0.8, 'success': 0.8, 'total_cost': 0.2603308, 'cost': 0.15389999999999998, 'inference_cost': 0.007861499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6071126461029053}\n", - "[flaml.tune.tune: 03-26 02:53:32] {811} INFO - trial 6 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.2629064, 'cost': 0.0025756000000000004, 'inference_cost': 0.00011848, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.7605307121989587}, 'config/max_tokens': 82, 'config/n': 9, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.4761645793914795}\n", - "[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 7 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.4132364, 'cost': 0.15033000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.14217004760152696}, 'config/max_tokens': 152, 'config/n': 67, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022079944610595703}\n", - "[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 8 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.6260264, 'cost': 0.21278999999999998, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.30070005663620336}, 'config/max_tokens': 70, 'config/n': 83, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022161006927490234}\n", - 
"[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 9 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:43] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6310854, 'cost': 0.005059, 'inference_cost': 0.00023457499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.16501589771914849}, 'config/max_tokens': 161, 'config/n': 10, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.868851661682129}\n", - "[flaml.tune.tune: 03-26 02:53:43] {811} INFO - trial 10 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:05] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6344234000000001, 'cost': 0.003338, 'inference_cost': 0.0001522, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.5902013629854229}, 'config/max_tokens': 56, 'config/n': 36, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 21.348156690597534}\n", - "[flaml.tune.tune: 03-26 02:54:05] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0.3476191678990812, 'success': 0.35, 'total_cost': 0.7530034000000003, 'cost': 0.11858000000000002, 'inference_cost': 0.005490999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.763240587143681}, 'config/max_tokens': 693, 'config/n': 42, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 28.24349284172058}\n", - "[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 12 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.9340534000000004, 'cost': 0.18105, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.2927979762895091}, 'config/max_tokens': 60, 'config/n': 97, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.002497434616088867}\n", - "[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 13 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:35] {215} INFO - result: {'expected_success': 0.28359375, 'success': 0.35, 'total_cost': 0.9496594000000004, 
'cost': 0.015605999999999998, 'inference_cost': 0.0007894, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.7186028103822503}, 'config/max_tokens': 288, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 2.29030704498291}\n", - "[flaml.tune.tune: 03-26 02:54:35] {811} INFO - trial 14 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.9550898000000005, 'cost': 0.0054304, 'inference_cost': 0.00026122, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'top_p': 0.3653649712141158}, 'config/max_tokens': 96, 'config/n': 75, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 44.837317943573}\n", - "[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 15 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0, 'total_cost': 1.0798498000000005, 'cost': 0.12475999999999998, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.3814115349046321}, 'config/max_tokens': 791, 'config/n': 92, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0024149417877197266}\n", - "[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 16 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:55:29] {215} INFO - result: {'expected_success': 0.5484931390416686, 'success': 0.55, 'total_cost': 1.1118038000000003, 'cost': 0.031954, 'inference_cost': 0.0015885000000000003, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4284507389678964}, 'config/max_tokens': 398, 'config/n': 11, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 9.271101951599121}\n", - "[flaml.tune.tune: 03-26 02:55:29] {811} INFO - trial 17 config: {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}\n", - "[flaml.tune.tune: 03-26 02:55:46] {215} INFO - result: {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 
'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n", - "[flaml.tune.tune: 03-26 02:55:46] {811} INFO - trial 18 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:56:05] {215} INFO - result: {'expected_success': 0.3551828400470255, 'success': 0.4, 'total_cost': 2.1919698000000003, 'cost': 0.061546, 'inference_cost': 0.0030944, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9761031076386442}, 'config/max_tokens': 349, 'config/n': 23, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 19.451276063919067}\n", - "[flaml.tune.tune: 03-26 02:56:05] {811} INFO - trial 19 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:56:18] {215} INFO - result: {'expected_success': 0.2898979473186428, 'success': 0.35, 'total_cost': 2.2507018000000003, 'cost': 0.058732, 'inference_cost': 0.0029537, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9822374507369328}, 'config/max_tokens': 393, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 13.075204372406006}\n", - "[flaml.tune.tune: 03-26 02:56:18] {811} INFO - trial 20 config: {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}\n", - "[flaml.tune.tune: 03-26 02:56:19] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 2.4012418000000006, 'cost': 0.15053999999999995, 'inference_cost': 0.007693499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}, 'config/model': 'gpt-4', 'config/max_tokens': 348, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'experiment_tag': 'exp', 'time_total_s': 0.6143312454223633}\n", - "[flaml.tune.tune: 03-26 02:56:19] {811} INFO - trial 21 config: {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}\n", - "[flaml.tune.tune: 03-26 02:56:32] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4043414000000016, 'cost': 0.0030996000000000005, 'inference_cost': 0.00014468, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}, 'config/model': 'text-ada-001', 'config/max_tokens': 130, 'config/n': 22, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.22084263211180838}, 'experiment_tag': 'exp', 'time_total_s': 13.137321710586548}\n", - "[flaml.tune.tune: 03-26 02:56:32] {811} INFO - trial 22 config: {'model': 'text-ada-001', 
'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}\n", - "[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4061918000000024, 'cost': 0.0018504, 'inference_cost': 8.222e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}, 'config/model': 'text-ada-001', 'config/max_tokens': 342, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.2881152790307279}, 'experiment_tag': 'exp', 'time_total_s': 2.4484035968780518}\n", - "[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 23 config: {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}\n", - "[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.618831800000003, 'cost': 0.21264, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}, 'config/model': 'gpt-4', 'config/max_tokens': 253, 'config/n': 23, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.41254458573656}, 'experiment_tag': 'exp', 'time_total_s': 0.003139972686767578}\n", - "[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 24 config: {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}\n", - "[flaml.tune.tune: 03-26 02:56:36] {215} INFO - result: {'expected_success': 0.8185185185185185, 'success': 0.85, 'total_cost': 2.912231800000003, 'cost': 0.29339999999999994, 'inference_cost': 0.014836499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}, 'config/model': 'gpt-4', 'config/max_tokens': 176, 'config/n': 3, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.0964133254059763}, 'experiment_tag': 'exp', 'time_total_s': 1.8556303977966309}\n", - "[flaml.tune.tune: 03-26 02:56:36] {811} INFO - trial 25 config: {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}\n", - "[flaml.tune.tune: 03-26 02:56:55] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9569863000000023, 'cost': 0.04475450000000001, 'inference_cost': 0.00222485, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 343, 'config/n': 27, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24286268913046594}, 'experiment_tag': 'exp', 'time_total_s': 19.013901472091675}\n", - "[flaml.tune.tune: 03-26 02:56:55] {811} INFO - trial 26 config: {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}\n", - "[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9595088000000023, 'cost': 0.0025224999999999996, 'inference_cost': 0.00011325, 'training_iteration': 0, 
'config': {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 130, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.26609522201207036}, 'experiment_tag': 'exp', 'time_total_s': 0.5786199569702148}\n", - "[flaml.tune.tune: 03-26 02:56:56] {811} INFO - trial 27 config: {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}\n", - "[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0, 'total_cost': 3.0123088000000022, 'cost': 0.05279999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}, 'config/model': 'gpt-4', 'config/max_tokens': 212, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24802150727233283}, 'experiment_tag': 'exp', 'time_total_s': 0.0019483566284179688}\n", - "[flaml.tune.tune: 03-26 02:56:56] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" - ] - } - ], - "source": [ - "config, analysis = oai.Completion.tune(\n", - " data=tune_data, # the data for tuning\n", - " metric=\"expected_success\", # the metric to optimize\n", - " mode=\"max\", # the optimization mode\n", - " eval_func=success_metrics, # the evaluation function to return the success metrics\n", - " # log_file_name=\"logs/humaneval.log\", # the log file name\n", - " inference_budget=0.05, # the inference budget (dollar)\n", - " optimization_budget=3, # the optimization budget (dollar)\n", - " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", - " # -1 means decided by the optimization budget only\n", - " num_samples=-1,\n", - " prompt=[\n", - " \"{prompt}\",\n", - " \"# Python 3{prompt}\",\n", - " \"Complete the following Python function:{prompt}\",\n", - " \"Complete the following Python function while including necessary import statements inside the function:{prompt}\",\n", - " ], # the prompt templates to choose from\n", - " stop=[\"\\nclass\", \"\\ndef\", \"\\nif\", \"\\nprint\"], # the stop sequence\n", - ")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output tuning results\n", - "\n", - "After the tuning, we can print out the config and the result found by FLAML:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:26:38.352710Z", - "iopub.status.busy": "2023-02-24T23:26:38.352378Z", - "iopub.status.idle": "2023-02-24T23:26:38.356939Z", - "shell.execute_reply": "2023-02-24T23:26:38.356217Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "optimized config {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': '# Python 3{prompt}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'temperature': 0.25447895557126815}\n", - "best result on tuning data {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 
'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n" - ] - } - ], - "source": [ - "print(\"optimized config\", config)\n", - "print(\"best result on tuning data\", analysis.best_result)\n", - "\n", - "# save results to notebook_output.txt\n", - "from flaml.version import __version__ as flaml_version\n", - "import datetime\n", - "results = {\"optimized config\": config, \"best result on tuning data\": analysis.best_result,}\n", - "result_info_dict = {\"result_name\": \"integrate_openai.ipynb + optimized config and best result on tuning data\",\n", - " \"flaml_version\": flaml_version, \n", - " \"time\": datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\"),\n", - " \"results\": results}\n", - "result_info = \"result name: {result_name}, flaml version: {flaml_version}, time: {time}, results: {results}\".format(**result_info_dict)\n", - "with open(\"notebook_output.txt\", \"a\") as f:\n", - " f.write(\"\\n\")\n", - " f.write(result_info)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Make a request with the tuned config\n", - "\n", - "We can apply the tuned config on the request for an example task:" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:26:38.359902Z", - "iopub.status.busy": "2023-02-24T23:26:38.359506Z", - "iopub.status.idle": "2023-02-24T23:26:39.343921Z", - "shell.execute_reply": "2023-02-24T23:26:39.343051Z" - }, - "slideshow": { - "slide_type": "subslide" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 1,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 2,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 3,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 4,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 5,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n 
result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 6,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 7,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 8,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n return [abs(a - b) for a, b in zip(game, guess)]\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 9,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 10,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 11,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 12,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " }\n", - " ],\n", - " \"created\": 1679636800,\n", - " \"id\": \"chatcmpl-6xUY4niTRrpJ5UShayb9QncgjS8rg\",\n", - " \"model\": \"gpt-4-0314\",\n", - " \"object\": \"chat.completion\",\n", - " \"usage\": {\n", - " \"completion_tokens\": 440,\n", - " \"prompt_tokens\": 236,\n", - " \"total_tokens\": 676\n", - " }\n", - "}\n", - "{'expected_success': 1.0, 'success': True}\n" - ] - } - ], - "source": [ - "responses = oai.Completion.create(context=tune_data[1], **config)\n", - "metric_results = success_metrics([response[\"message\"][\"content\"] if config[\"model\"] in oai.Completion.chat_models else response[\"text\"] for response in responses[\"choices\"]], **tune_data[1])\n", - "print(\"response on an example data instance:\", responses)\n", - "print(\"metric_results on the example data instance:\", metric_results)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluate the success rate on the test data\n", - "\n", - "You can use flaml's `oai.Completion.test` to evaluate the performance of an entire dataset with the tuned config. The following code will take a while to evaluate all the 144 test data instances. The cost is about $7 if you uncomment it and run it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:26:39.347295Z", - "iopub.status.busy": "2023-02-24T23:26:39.346994Z", - "iopub.status.idle": "2023-02-24T23:29:27.160335Z", - "shell.execute_reply": "2023-02-24T23:29:27.159519Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'expected_success': 0.8326778348739547, 'success': 0.8472222222222222, 'total_cost': 10.024478799999999, 'cost': 7.01217, 'inference_cost': 0.049131249999999994}\n" - ] - } - ], - "source": [ - "result = oai.Completion.test(test_data, config, success_metrics)\n", - "print(\"performance on test data with the tuned config:\", result)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result will vary with the inference budget and optimization budget.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "tutorial", - "language": "python", - "name": "tutorial" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "vscode": { - "interpreter": { - "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "24dd93300e0442788ee6cc1310e5bf14": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "35cd066a31b242bb87b2c106ee72e5f2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8e7ee7687a99410d88a98a74ecfcea99", - "IPY_MODEL_421e02a11a974b40b3ddb75382b3b640", - "IPY_MODEL_77db9797e78b49438d21c5c8da34b4cb" - ], - "layout": "IPY_MODEL_47d3046236a54b0e8f9ae455a82c7e0b", - "tabbable": null, - "tooltip": null - } - }, - "3d5d106a38954af2bb3bde5777702f4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "3e1ebb31412443b0bca86a301cbdac11": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "421e02a11a974b40b3ddb75382b3b640": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_e6398d4027c9459a97965b9d91ae484f", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3e1ebb31412443b0bca86a301cbdac11", - "tabbable": null, - "tooltip": null, - "value": 1 - } - }, - "47d3046236a54b0e8f9ae455a82c7e0b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "754800f7feb04acea977696e4787d1ff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "77db9797e78b49438d21c5c8da34b4cb": { - 
"model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_7b6c4e1c11e249409a1edcd63be450d8", - "placeholder": "​", - "style": "IPY_MODEL_3d5d106a38954af2bb3bde5777702f4e", - "tabbable": null, - "tooltip": null, - "value": " 1/1 [00:00<00:00, 44.40it/s]" - } - }, - "7b6c4e1c11e249409a1edcd63be450d8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8e7ee7687a99410d88a98a74ecfcea99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_754800f7feb04acea977696e4787d1ff", - "placeholder": "​", - "style": "IPY_MODEL_24dd93300e0442788ee6cc1310e5bf14", - "tabbable": null, - "tooltip": null, - "value": "100%" - } - }, - "e6398d4027c9459a97965b9d91ae484f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - 
"height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/website/docs/Examples/Integrate - OpenAI.md b/website/docs/Examples/AutoGen - OpenAI.md similarity index 98% rename from website/docs/Examples/Integrate - OpenAI.md rename to website/docs/Examples/AutoGen - OpenAI.md index 23c30c16c5..9349bfe149 100644 --- a/website/docs/Examples/Integrate - OpenAI.md +++ b/website/docs/Examples/AutoGen - OpenAI.md @@ -1,4 +1,4 @@ -FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of the OpenAI API. +FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of them. In this example, we will tune several hyperparameters for the OpenAI's completion API, including the temperature, prompt and n (number of completions), to optimize the inference performance for a code generation task. ### Prerequisites @@ -76,6 +76,8 @@ The tuning will be performed under the specified optimization budgets. Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces etc. ```python +from flaml import oai + config, analysis = oai.Completion.tune( data=tune_data, # the data for tuning metric="success", # the metric to optimize From 1d249e2c2e238d096662a15db0398a8ef20baba5 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Tue, 4 Apr 2023 11:09:46 +0000 Subject: [PATCH 09/27] logging and error handling --- flaml/autogen/oai/completion.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index fb2ee4f0e9..bfd9a8caae 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -3,7 +3,9 @@ import numpy as np import time from typing import List +import sys from flaml import tune, BlendSearch +from flaml.automl.logger import logger_formatter try: import openai @@ -23,6 +25,11 @@ "please install flaml[openai] option to use the flaml.oai subpackage." ) logger = logging.getLogger(__name__) +if not logger.handlers: + # Add the console handler. + _ch = logging.StreamHandler(stream=sys.stdout) + _ch.setFormatter(logger_formatter) + logger.addHandler(_ch) def get_key(config): @@ -158,7 +165,9 @@ def _get_response(cls, config: dict, eval_only=False, use_cache=True): # retry after retry_time seconds if time.time() - start_time + cls.retry_time < cls.retry_timeout: logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1) - elif not eval_only: + elif eval_only: + raise + else: break sleep(cls.retry_time) except InvalidRequestError: @@ -503,6 +512,7 @@ def eval_func(responses, **data): num_samples (int, optional): The number of samples to evaluate. -1 means no hard restriction in the number of trials and the actual number is decided by optimization_budget. Defaults to 1. 
+ logging_level (optional): logging level. Defaults to logging.WARNING. **config (dict): The search space to update over the default search. For prompt, please provide a string/Callable or a list of strings/Callables. - If prompt is provided for chat models, it will be converted to messages under role "user". @@ -626,6 +636,7 @@ def eval_func(responses, **data): mode=mode, space=space, ) + old_level = logger.getEffectiveLevel() logger.setLevel(logging_level) with diskcache.Cache(cls.cache_path) as cls._cache: analysis = tune.run( @@ -646,6 +657,7 @@ def eval_func(responses, **data): temperature_or_top_p = params.pop("temperature_or_top_p", None) if temperature_or_top_p: params.update(temperature_or_top_p) + logger.setLevel(old_level) return params, analysis @classmethod @@ -731,6 +743,7 @@ def test( use_cache=True, agg_method="avg", return_responses_and_per_instance_result=False, + logging_level=logging.WARNING, ): """Evaluate the responses created with the config for the OpenAI API call. @@ -781,6 +794,7 @@ def eval_func(responses, **data): return_responses_and_per_instance_result (bool): Whether to also return responses and per instance results in addition to the aggregated results. + logging_level (optional): logging level. Defaults to logging.WARNING. Returns: None when no valid eval_func is provided in either test or tune; @@ -791,6 +805,8 @@ def eval_func(responses, **data): metric_keys = None cost = 0 model = config["model"] + old_level = logger.getEffectiveLevel() + logger.setLevel(logging_level) for i, data_i in enumerate(data): logger.info(f"evaluating data instance {i}") response = cls.create(data_i, use_cache, **config) @@ -843,6 +859,7 @@ def eval_func(responses, **data): "agg_method needs to be a string ('avg' or 'median'),\ or a callable, or a dictionary of callable." ) + logger.setLevel(old_level) # should we also return the result_list and responses_list or not? if "cost" not in result_agg: result_agg["cost"] = cost From 2485ea447b0596d436979ae84c864f3d1510a52d Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Tue, 4 Apr 2023 17:52:22 +0000 Subject: [PATCH 10/27] better doc --- README.md | 4 ++-- flaml/autogen/oai/completion.py | 2 +- website/docs/Examples/AutoGen - OpenAI.md | 2 +- website/docs/Getting-Started.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 54856a31a8..b906bf6742 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,9 @@ ## What is FLAML FLAML is a lightweight Python library that finds accurate machine learning models automatically, efficiently and economically. It frees users from selecting -models and hyperparameters for each model. It can also be used to tune generic hyperparameters for large foundational models, MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on. +models and hyperparameters for each model. It can also be used to tune generic hyperparameters for foundation models, MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on. -1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large foundational models such as ChatGPT and GPT-4. +1. 
For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as ChatGPT and GPT-4. 1. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). 1. It supports fast automatic tuning, capable of handling complex constraints/guidance/early stopping. FLAML is powered by a new, [cost-effective hyperparameter optimization](https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function/#hyperparameter-optimization-algorithm) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index bfd9a8caae..dbed44bb66 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -899,7 +899,7 @@ def extract_text(cls, response: dict) -> List[str]: response (dict): The response from OpenAI API. Returns: - The text. + A list of text in the responses. """ choices = response["choices"] if "text" in choices[0]: diff --git a/website/docs/Examples/AutoGen - OpenAI.md b/website/docs/Examples/AutoGen - OpenAI.md index 9349bfe149..e048dd4c89 100644 --- a/website/docs/Examples/AutoGen - OpenAI.md +++ b/website/docs/Examples/AutoGen - OpenAI.md @@ -54,7 +54,7 @@ test_data = [ ### Defining the metric -Before starting tuning, you need to define the metric for the optimization. For each code generation task, we can use the model to generate multiple candidates, and then select one from them. If the final selected response can pass a unit test, we consider the task as successfully solved. Then we can define the mean success rate of a collection of tasks. +Before starting tuning, you need to define the metric for the optimization. For each code generation task, we can use the model to generate multiple candidate responses, and then select one from them. If the final selected response can pass a unit test, we consider the task as successfully solved. Then we can define the average success rate on a collection of tasks as the optimization metric. ```python from functools import partial diff --git a/website/docs/Getting-Started.md b/website/docs/Getting-Started.md index afcee636d2..aa8c86f7d4 100644 --- a/website/docs/Getting-Started.md +++ b/website/docs/Getting-Started.md @@ -7,7 +7,7 @@ learning models automatically, efficiently and economically. It frees users from ### Main Features -1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large foundational models such as ChatGPT and GPT-4. +1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as ChatGPT and GPT-4. 2. It is easy to customize or extend. 
Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). Users can customize only when and what they need to, and leave the rest to the library. 3. It supports fast and economical automatic tuning, capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping. FLAML is powered by a new, [cost-effective hyperparameter optimization](Use-Cases/Tune-User-Defined-Function#hyperparameter-optimization-algorithm) From 50b39d114e1df1c50e2a76bb904f6576d35cf9e7 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Wed, 5 Apr 2023 04:44:17 +0000 Subject: [PATCH 11/27] doc --- README.md | 16 +++ ...{AutoGen - OpenAI.md => AutoGen-OpenAI.md} | 2 + website/docs/Getting-Started.md | 20 +++ website/docs/Use-Cases/Auto-Generation.md | 117 ++++++++++++++++++ 4 files changed, 155 insertions(+) rename website/docs/Examples/{AutoGen - OpenAI.md => AutoGen-OpenAI.md} (99%) create mode 100644 website/docs/Use-Cases/Auto-Generation.md diff --git a/README.md b/README.md index b906bf6742..edc0dd1a6c 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,22 @@ estimator = LGBMRegressor() estimator.fit(X_train, y_train) ``` +* (New) You can optimize [generations](https://microsoft.github.io/FLAML/docs/Use-Cases/Auto-Generation) by ChatGPT or GPT-4 etc. with your own tuning data, success metrics and budgets. + +```python +from flaml import oai + +config, analysis = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=success_metrics, + inference_budget=0.05, + optimization_budget=3, + num_samples=-1, +) +``` + ## Documentation You can find a detailed documentation about FLAML [here](https://microsoft.github.io/FLAML/) where you can find the API documentation, use cases and examples. diff --git a/website/docs/Examples/AutoGen - OpenAI.md b/website/docs/Examples/AutoGen-OpenAI.md similarity index 99% rename from website/docs/Examples/AutoGen - OpenAI.md rename to website/docs/Examples/AutoGen-OpenAI.md index e048dd4c89..7419718b18 100644 --- a/website/docs/Examples/AutoGen - OpenAI.md +++ b/website/docs/Examples/AutoGen-OpenAI.md @@ -1,3 +1,5 @@ +# AutoGen - OpenAI + FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of them. In this example, we will tune several hyperparameters for the OpenAI's completion API, including the temperature, prompt and n (number of completions), to optimize the inference performance for a code generation task. diff --git a/website/docs/Getting-Started.md b/website/docs/Getting-Started.md index aa8c86f7d4..f8f3ac0069 100644 --- a/website/docs/Getting-Started.md +++ b/website/docs/Getting-Started.md @@ -86,6 +86,26 @@ from flaml.default import LGBMClassifier Then, you can use it just like you use the original `LGMBClassifier`. Your other code can remain unchanged. When you call the `fit()` function from `flaml.default.LGBMClassifier`, it will automatically instantiate a good data-dependent hyperparameter configuration for your dataset, which is expected to work better than the default configuration. +#### (New) [Auto Generation](Use-Cases/Auto-Generation) + +You can optimize generations by ChatGPT or GPT-4 etc. 
with your own tuning data, success metrics and budgets. + +```python +from flaml import oai + +config, analysis = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=success_metrics, + inference_budget=0.05, + optimization_budget=3, + num_samples=-1, +) +``` + +The optimization can help you maximize the utility out of these expensive models. + ### Where to Go Next? * Understand the use cases for [Task-oriented AutoML](Use-Cases/task-oriented-automl), [Tune user-defined function](Use-Cases/Tune-User-Defined-Function) and [Zero-shot AutoML](Use-Cases/Zero-Shot-AutoML). diff --git a/website/docs/Use-Cases/Auto-Generation.md b/website/docs/Use-Cases/Auto-Generation.md new file mode 100644 index 0000000000..c901cd18ab --- /dev/null +++ b/website/docs/Use-Cases/Auto-Generation.md @@ -0,0 +1,117 @@ +# Auto Generation + +`flaml.autogen` is a subpackage for automating generation tasks. It uses [`flaml.tune`](../reference/tune/tune) to find good hyperparameter configurations under budget constraints. +Such optimization has several benefits: +* Maximize the utility out of using expensive foundation models. +* Reduce the inference cost by using cheaper models or configurations which achieve equal or better performance. + +## Choices to Optimize + +The cost of using foundation models for text generation is typically measured in terms of the number of tokens in the input and output combined. From the perspective of an application builder using foundation models, the use case is to maximize the utility of the generated text under an inference budget constraint (e.g., measured by the average dollar cost needed to solve a coding problem). This can be achieved by optimizing the hyperparameters of the inference, +which can significantly affect both the utility and the cost of the generated text. + +The tunable hyperparameters include: +1. model - this is a required input, specifying the model ID to use. +1. prompt - the input prompt to the model, which provides the context for the text generation task. +1. max_tokens - the maximum number of tokens (words or word pieces) to generate in the output. +1. temperature - a value between 0 and 1 that controls the randomness of the generated text. A higher temperature will result in more random and diverse text, while a lower temperature will result in more predictable text. +1. top_p - a value between 0 and 1 that controls the sampling probability mass for each token generation. A lower top_p value will make it more likely to generate text based on the most likely tokens, while a higher value will allow the model to explore a wider range of possible tokens. +1. n - the number of responses to generate for a given prompt. Generating multiple responses can provide more diverse and potentially more useful output, but it also increases the cost of the request. +1. stop - a list of strings that, when encountered in the generated text, will cause the generation to stop. This can be used to control the length or the validity of the output. +1. presence_penalty, frequency_penalty - values that control the relative importance of the presence and frequency of certain words or phrases in the generated text. +1. best_of - the number of responses to generate server-side when selecting the "best" (the one with the highest log probability per token) response for a given prompt. + +The cost and utility of text generation are intertwined with the joint effect of these hyperparameters. 
+There are also complex interactions among subsets of the hyperparameters. For example,
+it is not recommended to alter both the temperature and the top_p away from their default values, because they both control the randomness of the generated text, and changing the two at the same time can result in conflicting effects; n and best_of are rarely tuned together because if the application can process multiple outputs, filtering on the server side causes unnecessary information loss; both n and max_tokens affect the total number of tokens generated, which in turn affects the cost of the request.
+These interactions and trade-offs make it difficult to manually determine the optimal hyperparameter settings for a given text generation task.
+
+## Tune Hyperparameters
+
+The tuning can be performed with the following information:
+1. Validation data.
+1. Evaluation function.
+1. Metric to optimize.
+1. Search space.
+1. Budgets: inference and optimization, respectively.
+
+### Validation data
+
+Collect a diverse set of instances. They can be stored in an iterable of dicts. For example, each instance dict can contain "problem" as a key and the description str of a math problem as the value, and "solution" as a key and the solution str as the value.
+
+### Evaluation function
+
+The evaluation function should take as input a list of responses, plus keyword arguments corresponding to the keys in each validation data instance, and output a dict of metrics. For example,
+
+```python
+from typing import Dict, List
+
+
+def success_metrics(responses: List[str], problem: str, solution: str) -> Dict:
+    # select a response from the list of responses
+    # check whether the answer is correct
+    return {"success": True or False}
+```
+
+`flaml.autogen` offers some example evaluation functions for common tasks such as code generation and math problem solving.
+
+### Metric to optimize
+
+The metric to optimize is usually an aggregated metric over all the tuning data instances. For example, users can specify "success" as the metric and "max" as the optimization mode. By default, the aggregation function takes the average. Users can provide a customized aggregation function if needed.
+
+### Search space
+
+Users can specify an (optional) search range for each hyperparameter, as sketched right after this list.
+
+1. model. Either a constant str, or multiple choices specified by `flaml.tune.choice`.
+1. prompt. Either a str or a list of strs, giving the prompt template(s).
+Each prompt template will be formatted with each data instance. For example, the prompt template can be:
+"{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{{}}."
+And `{problem}` will be replaced by the "problem" field of each data instance.
+1. max_tokens, n, best_of. They can be constants, or specified by `flaml.tune.randint`, `flaml.tune.qrandint`, `flaml.tune.lograndint` or `flaml.tune.qlograndint`. By default, max_tokens is searched in [50, 1000); n is searched in [1, 100); and best_of is fixed to 1.
+1. stop. It can be a str, a list of strs, a list of lists of strs, or None. Default is None.
+1. temperature or top_p. One of them can be specified as a constant or by `flaml.tune.uniform`, `flaml.tune.loguniform`, etc.
+Please don't provide both. By default, each configuration will choose either a temperature or a top_p in [0, 1] uniformly.
+1. presence_penalty, frequency_penalty. They can be constants or specified by `flaml.tune.uniform` etc. Not tuned by default.
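+
+For concreteness, here is a sketch of how such a search space might be written with `flaml.tune`. The ranges mirror the stated defaults where they are given above and are otherwise illustrative assumptions:
+
+```python
+from flaml import tune
+
+# Illustrative search-space keyword arguments for tuning.
+search_space = dict(
+    model=tune.choice(["text-davinci-003", "gpt-3.5-turbo"]),  # multiple model choices
+    prompt="{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{{}}.",  # one prompt template
+    max_tokens=tune.lograndint(50, 1000),  # the default search range [50, 1000)
+    n=tune.randint(1, 100),  # the default search range [1, 100)
+    best_of=1,  # fixed to 1 by default
+    temperature=tune.uniform(0, 1),  # tune temperature only and leave top_p unset
+)
+```
+
+In the tuning call introduced next, these would simply appear as extra keyword arguments (e.g., via `oai.Completion.tune(..., **search_space)`).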
+
+### Budgets
+
+One can specify an inference budget and an optimization budget.
+The inference budget refers to the average inference cost per data instance.
+The optimization budget refers to the total budget allowed for the tuning process. Both are measured in dollars, based on the price per 1000 tokens.
+
+### Perform tuning
+
+Now, you can use [`flaml.oai.Completion.tune`](../reference/autogen/oai/completion#tune) for tuning. For example,
+
+```python
+from flaml import oai
+
+config, analysis = oai.Completion.tune(
+    data=tune_data,
+    metric="success",
+    mode="max",
+    eval_func=success_metrics,
+    inference_budget=0.05,
+    optimization_budget=3,
+    num_samples=-1,
+)
+```
+
+`num_samples` is the number of configurations to sample; -1 means unlimited (until the optimization budget is exhausted).
+The returned `config` contains the optimized configuration, and `analysis` contains an [ExperimentAnalysis](../reference/tune/analysis#experimentanalysis-objects) object for all the tried configurations and results.
+
+### Perform inference with the tuned config
+
+One can use [`flaml.oai.Completion.create`](../reference/autogen/oai/completion#create) to perform inference. It materializes a prompt using a given context. For example,
+
+```python
+response = oai.Completion.create(problme=problem, **config)
+responses = oai.Completion.extract_test(response)
+# Extract a list of str responses
+```
+
+`flaml.oai.Completion` is compatible with both `openai.Completion` and `openai.ChatCompletion`, so models such as "text-davinci-003", "gpt-3.5-turbo" and "gpt-4" can share a common API. When only tuning the chat-based models, `flaml.oai.ChatCompletion` can be used.
+
+`flaml.oai.Completion` also offers some additional utilities, including a `test` function to conveniently evaluate the configuration over test data, a `cost` function to calculate the cost of an API call, and caching and error handling. It also supports both the OpenAI API and the Azure OpenAI API.
+
+Interested in trying it yourself?
Please check the following notebook examples: +* [Optimize for Code Gen](https://github.com/microsoft/FLAML/blob/main/notebook/autogen_openai.ipynb) +* [Optimize for Math](https://github.com/microsoft/FLAML/blob/main/notebook/autogen_chatgpt.ipynb) From b2f7361c00356197f9a20295a53a0f0909ad458b Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Wed, 5 Apr 2023 21:36:51 +0000 Subject: [PATCH 12/27] codegen optimized --- flaml/autogen/code_utils.py | 11 +- notebook/research/autogen_code.ipynb | 785 +++++++++++++++++++++++++++ test/openai/test_completion.py | 2 +- 3 files changed, 794 insertions(+), 4 deletions(-) create mode 100644 notebook/research/autogen_code.ipynb diff --git a/flaml/autogen/code_utils.py b/flaml/autogen/code_utils.py index 6392e5acd8..e62b170c2a 100644 --- a/flaml/autogen/code_utils.py +++ b/flaml/autogen/code_utils.py @@ -60,7 +60,7 @@ def generate_assertions( def _remove_check(response): """Remove the check function from the response.""" # find the position of the check function - pos = response.find("def check") + pos = response.find("def check(") if pos == -1: return response return response[:pos] @@ -92,7 +92,11 @@ def success_metrics( success_list = [] for i in range(n): response = _remove_check(responses[i]) - code = f"{definition}{response}\n{test}\ncheck({entry_point})" + code = ( + f"{response}\n{test}\ncheck({entry_point})" + if response.startswith("def") + else f"{definition}{response}\n{test}\ncheck({entry_point})" + ) success = execute_code(code) success_list.append(success) return { @@ -150,6 +154,7 @@ def implement(definition: str, configs: List[Dict]) -> Tuple[str, float]: Returns: str: The implementation. float: The cost of the implementation. + int: The index of the configuration which generates the implementation. """ assertions, cost = generate_assertions(definition) for i, config in enumerate(configs): @@ -158,4 +163,4 @@ def implement(definition: str, configs: List[Dict]) -> Tuple[str, float]: responses = oai.Completion.extract_text(response) metrics = success_metrics(responses, definition, assertions=assertions) if metrics["succeed_assertions"] or i == len(configs) - 1: - return responses[metrics["index_selected"]], cost + return responses[metrics["index_selected"]], cost, i diff --git a/notebook/research/autogen_code.ipynb b/notebook/research/autogen_code.ipynb new file mode 100644 index 0000000000..9b0eadfb35 --- /dev/null +++ b/notebook/research/autogen_code.ipynb @@ -0,0 +1,785 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Use FLAML to Optimize Code Generation Performance\n", + "\n", + "In this notebook, we optimize OpenAI models for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n", + "\n", + "## Requirements\n", + "\n", + "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n", + "```bash\n", + "pip install flaml[openai]==1.2.0\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.910966Z", + "iopub.status.busy": "2023-02-24T23:25:36.910473Z", + "iopub.status.idle": "2023-02-24T23:25:36.914554Z", + "shell.execute_reply": "2023-02-24T23:25:36.914030Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[openai]==1.2.0 datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set your OpenAI key:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.917301Z", + "iopub.status.busy": "2023-02-24T23:25:36.917011Z", + "iopub.status.idle": "2023-02-24T23:25:36.923156Z", + "shell.execute_reply": "2023-02-24T23:25:36.922619Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you use Azure OpenAI, uncomment the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.925804Z", + "iopub.status.busy": "2023-02-24T23:25:36.925423Z", + "iopub.status.idle": "2023-02-24T23:25:36.928191Z", + "shell.execute_reply": "2023-02-24T23:25:36.927673Z" + } + }, + "outputs": [], + "source": [ + "# openai.api_type = \"azure\"\n", + "# openai.api_base = \"https://.openai.azure.com/\"\n", + "# openai.api_version = \"2023-03-15-preview\" # change if necessary" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset\n", + "\n", + "First, we load the humaneval dataset. The dataset contains 164 examples. In each example, the \"prompt\" is the prompt string for eliciting the code generation (renamed into \"definition\"), \"test\" is the Python code for unit test for the example, and \"entry_point\" is the function name to be tested." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.931255Z", + "iopub.status.busy": "2023-02-24T23:25:36.930838Z", + "iopub.status.idle": "2023-02-24T23:25:39.148799Z", + "shell.execute_reply": "2023-02-24T23:25:39.148113Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset openai_humaneval (/home/vscode/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1ca2f87c5d77475fb3d8478a92386c70", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 Date: Wed, 5 Apr 2023 23:09:56 +0000 Subject: [PATCH 13/27] GPT series --- README.md | 2 +- website/docs/Getting-Started.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index edc0dd1a6c..5fdb5e2a5e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ FLAML is a lightweight Python library that finds accurate machine learning models automatically, efficiently and economically. It frees users from selecting models and hyperparameters for each model. 
It can also be used to tune generic hyperparameters for foundation models, MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on. -1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as ChatGPT and GPT-4. +1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as the GPT series. 1. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). 1. It supports fast automatic tuning, capable of handling complex constraints/guidance/early stopping. FLAML is powered by a new, [cost-effective hyperparameter optimization](https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function/#hyperparameter-optimization-algorithm) diff --git a/website/docs/Getting-Started.md b/website/docs/Getting-Started.md index f8f3ac0069..67f97eeeaa 100644 --- a/website/docs/Getting-Started.md +++ b/website/docs/Getting-Started.md @@ -7,7 +7,7 @@ learning models automatically, efficiently and economically. It frees users from ### Main Features -1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as ChatGPT and GPT-4. +1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as the GPT series. 2. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). Users can customize only when and what they need to, and leave the rest to the library. 3. It supports fast and economical automatic tuning, capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping. 
FLAML is powered by a new, [cost-effective hyperparameter optimization](Use-Cases/Tune-User-Defined-Function#hyperparameter-optimization-algorithm) From 1b91c1faeeb71630d49e03a5a2589c0afd8f80fd Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 04:04:08 +0000 Subject: [PATCH 14/27] text --- website/docs/Use-Cases/Auto-Generation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/Use-Cases/Auto-Generation.md b/website/docs/Use-Cases/Auto-Generation.md index c901cd18ab..5fc10a21f9 100644 --- a/website/docs/Use-Cases/Auto-Generation.md +++ b/website/docs/Use-Cases/Auto-Generation.md @@ -104,7 +104,7 @@ One can use [`flaml.oai.Completion.create`](../reference/autogen/oai/completion# ```python response = oai.Completion.create(problme=problem, **config) -responses = oai.Completion.extract_test(response) +responses = oai.Completion.extract_text(response) # Extract a list of str responses ``` From 0024b8584c3b67565ff2ddb81ec2992663915d9e Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 04:13:26 +0000 Subject: [PATCH 15/27] no demo example --- notebook/autogen_chatgpt.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook/autogen_chatgpt.ipynb b/notebook/autogen_chatgpt.ipynb index 9f4bc68e6c..61d5914b4b 100644 --- a/notebook/autogen_chatgpt.ipynb +++ b/notebook/autogen_chatgpt.ipynb @@ -103,7 +103,7 @@ "source": [ "## Load dataset\n", "\n", - "First, we load the competition_math dataset. The dataset contains 201 \"Level 2\" Algebra examples. We use a random sample of 20 examples for tuning the generation hyperparameters and the remaining for evaluation. We use one demonstration example in the prompt." + "First, we load the competition_math dataset. The dataset contains 201 \"Level 2\" Algebra examples. We use a random sample of 20 examples for tuning the generation hyperparameters and the remaining for evaluation." ] }, { From 4d4db9ac73e1cb615f3623378a07abee9dda5b81 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 04:21:14 +0000 Subject: [PATCH 16/27] math --- notebook/research/math_level5counting.ipynb | 785 ++++++++++++++++++++ 1 file changed, 785 insertions(+) create mode 100644 notebook/research/math_level5counting.ipynb diff --git a/notebook/research/math_level5counting.ipynb b/notebook/research/math_level5counting.ipynb new file mode 100644 index 0000000000..162f7a392b --- /dev/null +++ b/notebook/research/math_level5counting.ipynb @@ -0,0 +1,785 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Math Study\n", + "\n", + "In this notebook, we study GPT-4 for math problem solving. We use [the MATH benchmark](https://crfm.stanford.edu/helm/latest/?group=math_chain_of_thought) for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning. \n", + "\n", + "## Requirements\n", + "\n", + "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n", + "```bash\n", + "pip install flaml[openai]==1.2.0\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.317406Z", + "iopub.status.busy": "2023-02-13T23:40:52.316561Z", + "iopub.status.idle": "2023-02-13T23:40:52.321193Z", + "shell.execute_reply": "2023-02-13T23:40:52.320628Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[openai]==1.2.0 datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set your OpenAI key:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.324240Z", + "iopub.status.busy": "2023-02-13T23:40:52.323783Z", + "iopub.status.idle": "2023-02-13T23:40:52.330570Z", + "shell.execute_reply": "2023-02-13T23:40:52.329750Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "# import openai\n", + "# openai.api_key_path = \"../key.txt\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment the following to use Azure OpenAI:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.333547Z", + "iopub.status.busy": "2023-02-13T23:40:52.333249Z", + "iopub.status.idle": "2023-02-13T23:40:52.336508Z", + "shell.execute_reply": "2023-02-13T23:40:52.335858Z" + } + }, + "outputs": [], + "source": [ + "# openai.api_type = \"azure\"\n", + "# openai.api_base = \"https://.openai.azure.com/\"\n", + "# openai.api_version = \"2023-03-15-preview\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset\n", + "\n", + "First, we load the competition_math dataset. We use a random sample of 50 examples for testing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.339977Z", + "iopub.status.busy": "2023-02-13T23:40:52.339556Z", + "iopub.status.idle": "2023-02-13T23:40:54.603349Z", + "shell.execute_reply": "2023-02-13T23:40:54.602630Z" + } + }, + "outputs": [], + "source": [ + "import datasets\n", + "\n", + "seed = 41\n", + "data = datasets.load_dataset(\"competition_math\")\n", + "train_data = data[\"train\"].shuffle(seed=seed)\n", + "test_data = data[\"test\"].shuffle(seed=seed)\n", + "n_tune_data = 20\n", + "tune_data = [\n", + " {\n", + " \"problem\": train_data[x][\"problem\"],\n", + " \"solution\": train_data[x][\"solution\"],\n", + " }\n", + " for x in range(len(train_data)) if train_data[x][\"level\"] == \"Level 5\" and train_data[x][\"type\"] == \"Counting & Probability\"\n", + "][:n_tune_data]\n", + "test_data = [\n", + " {\n", + " \"problem\": test_data[x][\"problem\"],\n", + " \"solution\": test_data[x][\"solution\"],\n", + " }\n", + " for x in range(len(test_data)) if test_data[x][\"level\"] == \"Level 5\" and test_data[x][\"type\"] == \"Counting & Probability\"\n", + "]\n", + "print(len(tune_data), len(test_data))\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Check a tuning example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.607152Z", + "iopub.status.busy": "2023-02-13T23:40:54.606441Z", + "iopub.status.idle": "2023-02-13T23:40:54.610504Z", + "shell.execute_reply": "2023-02-13T23:40:54.609759Z" + }, + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(tune_data[1][\"problem\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is one example of the canonical solution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.613590Z", + "iopub.status.busy": "2023-02-13T23:40:54.613168Z", + "iopub.status.idle": "2023-02-13T23:40:54.616873Z", + "shell.execute_reply": "2023-02-13T23:40:54.616193Z" + } + }, + "outputs": [], + "source": [ + "print(tune_data[1][\"solution\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Success Metric\n", + "\n", + "For each math task, we use voting to select a response with the most common answers out of all the generated responses. If it has an equivalent answer to the canonical solution, we consider the task as successfully solved. Then we can optimize the mean success rate of a collection of tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.626998Z", + "iopub.status.busy": "2023-02-13T23:40:54.626593Z", + "iopub.status.idle": "2023-02-13T23:40:54.631383Z", + "shell.execute_reply": "2023-02-13T23:40:54.630770Z" + } + }, + "outputs": [], + "source": [ + "from flaml.autogen.math_utils import success_metrics" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Import the oai and tune subpackages from flaml.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.634335Z", + "iopub.status.busy": "2023-02-13T23:40:54.633929Z", + "iopub.status.idle": "2023-02-13T23:40:56.105700Z", + "shell.execute_reply": "2023-02-13T23:40:56.105085Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "from flaml import oai" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.109177Z", + "iopub.status.busy": "2023-02-13T23:40:56.108624Z", + "iopub.status.idle": "2023-02-13T23:40:56.112651Z", + "shell.execute_reply": "2023-02-13T23:40:56.112076Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "oai.ChatCompletion.set_cache(seed)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. The cache for different seeds are stored separately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.115383Z", + "iopub.status.busy": "2023-02-13T23:40:56.114975Z", + "iopub.status.idle": "2023-02-13T23:41:55.045654Z", + "shell.execute_reply": "2023-02-13T23:41:55.044973Z" + } + }, + "outputs": [], + "source": [ + "prompts = [\"{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate the success rate on the test data\n", + "\n", + "You can use flaml's `oai.ChatCompletion.test` to evaluate the performance of an entire dataset with the tuned config." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "config_n1 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": 1}\n", + "n1_result = oai.ChatCompletion.test(test_data[:50], config_n1, success_metrics)\n", + "print(n1_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "oai.ChatCompletion.request_timeout = 120\n", + "config_n10 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": 10}\n", + "n10_result = oai.ChatCompletion.test(test_data[:50], config_n10, success_metrics, logging_level=logging.INFO)\n", + "print(n10_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_n30 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": 30}\n", + "n30_result = oai.ChatCompletion.test(test_data[:50], config_n30, success_metrics, logging_level=logging.INFO)\n", + "print(n30_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "import matplotlib.pyplot as plt\n", + "\n", + "prompts = [\"{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.\"]\n", + "markers = [\"o\", \"s\", \"D\", \"v\", \"p\", \"h\", \"d\", \"P\", \"X\", \"H\", \"8\", \"4\", \"3\", \"2\", \"1\", \"x\", \"+\", \">\", \"<\", \"^\", \"v\", \"1\", \"2\", \"3\", \"4\", \"8\", \"s\", \"p\", \"*\", \"h\", \"H\", \"d\", \"D\", \"|\", \"_\"]\n", + "for j, n in enumerate([10, 30]):\n", + " config = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": n}\n", + " metrics = []\n", + " x, y = [], []\n", + " votes_success = defaultdict(lambda: [0, 0])\n", + " for i, data_i in enumerate(test_data[:50]):\n", + " response = oai.ChatCompletion.create(context=data_i, **config)\n", + " responses = oai.ChatCompletion.extract_text(response)\n", + " metrics.append(success_metrics(responses, **data_i))\n", + " votes = metrics[-1][\"votes\"]\n", + " success = metrics[-1][\"success_vote\"]\n", + " votes_success[votes][0] += 1\n", + " votes_success[votes][1] += success\n", + " for votes in votes_success:\n", + " x.append(votes)\n", + " y.append(votes_success[votes][1] / votes_success[votes][0])\n", + "\n", + " plt.scatter(x, y, marker=markers[j])\n", + " plt.xlabel(\"top vote\")\n", + " plt.ylabel(\"success rate\")\n", + "plt.legend([\"n=10\", \"n=30\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "2d910cfd2d2a4fc49fc30fbbdc5576a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "454146d0f7224f038689031002906e6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", + "IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", + "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" + ], + "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", + "tabbable": null, + "tooltip": null + } + }, + "577e1e3cc4db4942b0883577b3b52755": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", + "tabbable": null, + "tooltip": null, + "value": 1 + } + }, + "6086462a12d54bafa59d3c4566f06cb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74a6ba0c3cbc4051be0a83e152fe1e62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d3f3d9e15894d05a4d188ff4f466554": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "b40bdfb1ac1d4cffb7cefcb870c64d45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", + "placeholder": "​", + "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", + "tabbable": null, + "tooltip": null, + "value": " 1/1 [00:00<00:00, 44.69it/s]" + } + }, + "ca245376fd9f4354af6b2befe4af4466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "dc83c7bff2f241309537a8119dfc7555": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4ae2b6f5a974fd4bafb6abb9d12ff26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", + "placeholder": "​", + "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", + "tabbable": null, + "tooltip": null, + "value": "100%" + } + }, + "f1355871cc6f4dd4b50d9df5af20e5c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From ab0c90e422829672f5a0760198e211152f838335 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 05:14:36 +0000 Subject: [PATCH 17/27] import openai --- notebook/autogen_chatgpt.ipynb | 1 + notebook/autogen_openai.ipynb | 1 + website/docs/Examples/AutoGen-OpenAI.md | 2 ++ 3 files changed, 4 insertions(+) diff --git a/notebook/autogen_chatgpt.ipynb b/notebook/autogen_chatgpt.ipynb index 61d5914b4b..702b79620f 100644 --- a/notebook/autogen_chatgpt.ipynb +++ b/notebook/autogen_chatgpt.ipynb @@ -91,6 +91,7 @@ }, "outputs": [], "source": [ + "# import openai\n", "# openai.api_type = \"azure\"\n", "# openai.api_base = \"https://.openai.azure.com/\"\n", "# openai.api_version = \"2023-03-15-preview\"" diff --git a/notebook/autogen_openai.ipynb b/notebook/autogen_openai.ipynb index 8e5e0d4f38..5b8fc95e63 100644 --- a/notebook/autogen_openai.ipynb +++ b/notebook/autogen_openai.ipynb @@ -91,6 +91,7 @@ }, "outputs": [], "source": [ + "# import openai\n", "# openai.api_type = \"azure\"\n", "# openai.api_base = \"https://.openai.azure.com/\"\n", "# openai.api_version = \"2023-03-15-preview\" # change if necessary" diff --git a/website/docs/Examples/AutoGen-OpenAI.md b/website/docs/Examples/AutoGen-OpenAI.md index 7419718b18..19ee7c61fe 100644 --- a/website/docs/Examples/AutoGen-OpenAI.md +++ b/website/docs/Examples/AutoGen-OpenAI.md @@ -21,6 +21,8 
@@ if "OPENAI_API_KEY" not in os.environ: If you use Azure OpenAI, set up Azure using the following code: ```python +import openai + openai.api_type = "azure" openai.api_base = "https://.openai.azure.com/" openai.api_version = "2023-03-15-preview" # change if necessary From 9976da6399ea36776934cbacb4fbd7b59c104e4a Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 17:46:03 +0000 Subject: [PATCH 18/27] import openai --- notebook/research/autogen_code.ipynb | 1 + notebook/research/math_level5counting.ipynb | 1 + 2 files changed, 2 insertions(+) diff --git a/notebook/research/autogen_code.ipynb b/notebook/research/autogen_code.ipynb index 9b0eadfb35..5fd40ea8b4 100644 --- a/notebook/research/autogen_code.ipynb +++ b/notebook/research/autogen_code.ipynb @@ -89,6 +89,7 @@ }, "outputs": [], "source": [ + "# import openai\n", "# openai.api_type = \"azure\"\n", "# openai.api_base = \"https://.openai.azure.com/\"\n", "# openai.api_version = \"2023-03-15-preview\" # change if necessary" diff --git a/notebook/research/math_level5counting.ipynb b/notebook/research/math_level5counting.ipynb index 162f7a392b..ed7aaf45b4 100644 --- a/notebook/research/math_level5counting.ipynb +++ b/notebook/research/math_level5counting.ipynb @@ -91,6 +91,7 @@ }, "outputs": [], "source": [ + "# import openai\n", "# openai.api_type = \"azure\"\n", "# openai.api_base = \"https://.openai.azure.com/\"\n", "# openai.api_version = \"2023-03-15-preview\"" From a292e168d1866033ef8209e9f19bfd51d0b68271 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 18:06:05 +0000 Subject: [PATCH 19/27] azure model name --- flaml/autogen/oai/completion.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index dbed44bb66..4026761e35 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -58,6 +58,7 @@ class Completion: chat_models = { "gpt-3.5-turbo", "gpt-3.5-turbo-0301", + "chatgpt-35-turbo-0301", "gpt-4", "gpt-4-32k", "gpt-4-32k-0314", @@ -75,6 +76,7 @@ class Completion: "text-davinci-003": 0.02, "gpt-3.5-turbo": 0.002, "gpt-3.5-turbo-0301": 0.002, + "chatgpt-35-turbo-0301": 0.002, "gpt-4": (0.03, 0.06), "gpt-4-0314": (0.03, 0.06), "gpt-4-32k": (0.06, 0.12), @@ -910,15 +912,6 @@ def extract_text(cls, response: dict) -> List[str]: class ChatCompletion(Completion): """A class for OpenAI API ChatCompletion.""" - price1K = { - "gpt-3.5-turbo": 0.002, - "gpt-3.5-turbo-0301": 0.002, - "gpt-4": (0.03, 0.06), - "gpt-4-0314": (0.03, 0.06), - "gpt-4-32k": (0.06, 0.12), - "gpt-4-32k-0314": (0.06, 0.12), - } - default_search_space = Completion.default_search_space.copy() default_search_space["model"] = tune.choice(["gpt-3.5-turbo", "gpt-4"]) openai_completion_class = not ERROR and openai.ChatCompletion From 1da255ed5b0a62073dfc92bcf47ecea38bff8e05 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 19:19:45 +0000 Subject: [PATCH 20/27] azure model name --- flaml/autogen/oai/completion.py | 4 ++-- notebook/autogen_chatgpt.ipynb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index 4026761e35..88b3c4258b 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -58,7 +58,7 @@ class Completion: chat_models = { "gpt-3.5-turbo", "gpt-3.5-turbo-0301", - "chatgpt-35-turbo-0301", + "gpt-35-turbo", "gpt-4", "gpt-4-32k", "gpt-4-32k-0314", @@ -76,7 +76,7 @@ class Completion: "text-davinci-003": 
0.02, "gpt-3.5-turbo": 0.002, "gpt-3.5-turbo-0301": 0.002, - "chatgpt-35-turbo-0301": 0.002, + "gpt-35-turbo": 0.002, "gpt-4": (0.03, 0.06), "gpt-4-0314": (0.03, 0.06), "gpt-4-32k": (0.06, 0.12), diff --git a/notebook/autogen_chatgpt.ipynb b/notebook/autogen_chatgpt.ipynb index 702b79620f..344ae50e9f 100644 --- a/notebook/autogen_chatgpt.ipynb +++ b/notebook/autogen_chatgpt.ipynb @@ -442,7 +442,7 @@ " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", " # -1 means decided by the optimization budget only\n", " num_samples=-1,\n", - " # model=\"chatgpt-35-turbo-0301\", # uncomment if using Azure OpenAI\n", + " # model=\"gpt-35-turbo\", # the gpt-3.5-turbo model name in Azure OpenAI\n", " # model=\"gpt-3.5-turbo\", # uncomment if you don't have access to gpt-4\n", " prompt=prompts, # the prompt templates to choose from\n", " # stop=\"###\", # the stop sequence\n", From c71ecbcf4ff5f37aa46ee7ed84a43dcc058ec74f Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 6 Apr 2023 19:34:44 +0000 Subject: [PATCH 21/27] openai version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b576486606..60e6e8de20 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,7 @@ "pytorch-forecasting>=0.9.0", ], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], - "openai": ["openai==0.27.0", "diskcache", "optuna==2.8.0"], + "openai": ["openai==0.27.4", "diskcache", "optuna==2.8.0"], "synapse": ["joblibspark>=0.5.0", "optuna==2.8.0", "pyspark>=3.2.0"], }, classifiers=[ From 11b9a9ac6858c2a7b4720b38287add75f8d54768 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Fri, 7 Apr 2023 00:16:48 +0000 Subject: [PATCH 22/27] generate assertion if necessary --- flaml/autogen/code_utils.py | 15 +++++++++++++-- flaml/autogen/oai/completion.py | 4 +++- notebook/autogen_chatgpt.ipynb | 1 - 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/flaml/autogen/code_utils.py b/flaml/autogen/code_utils.py index e62b170c2a..571d320d4b 100644 --- a/flaml/autogen/code_utils.py +++ b/flaml/autogen/code_utils.py @@ -129,6 +129,7 @@ def success_metrics( "index_selected": i, "succeed_assertions": succeed_assertions, "gen_cost": gen_cost, + "assertions": assertions, } code_test = ( f"{response}\n{test}\ncheck({entry_point})" @@ -141,26 +142,36 @@ def success_metrics( "succeed_assertions": succeed_assertions, "success": success, "gen_cost": gen_cost, + "assertions": assertions, } -def implement(definition: str, configs: List[Dict]) -> Tuple[str, float]: +def implement( + definition: str, + configs: List[Dict], + assertions: Optional[ + Union[str, Callable[[str], Tuple[str, float]]] + ] = generate_assertions, +) -> Tuple[str, float]: """Implement a function. Args: definition (str): The function definition, including the signature and docstr. configs (list): The list of configurations for completion. + assertions (Optional, str or Callable): The assertion code which serves as a filter of the responses, or an assertion generator. Returns: str: The implementation. float: The cost of the implementation. int: The index of the configuration which generates the implementation. 
""" - assertions, cost = generate_assertions(definition) + cost = 0 for i, config in enumerate(configs): response = oai.Completion.create({"definition": definition}, **config) cost += oai.Completion.cost(config["model"], response) responses = oai.Completion.extract_text(response) metrics = success_metrics(responses, definition, assertions=assertions) + assertions = metrics["assertions"] + cost += metrics["gen_cost"] if metrics["succeed_assertions"] or i == len(configs) - 1: return responses[metrics["index_selected"]], cost, i diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index 88b3c4258b..9337b22df8 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -176,7 +176,9 @@ def _get_response(cls, config: dict, eval_only=False, use_cache=True): if "azure" == openai.api_type and "model" in config: # azure api uses "engine" instead of "model" config = config.copy() - config["engine"] = config.pop("model") + config["engine"] = config.pop("model").replace( + "gpt-3.5-turbo", "gpt-35-turbo" + ) else: raise logger.warning( diff --git a/notebook/autogen_chatgpt.ipynb b/notebook/autogen_chatgpt.ipynb index 344ae50e9f..a60fab4e30 100644 --- a/notebook/autogen_chatgpt.ipynb +++ b/notebook/autogen_chatgpt.ipynb @@ -442,7 +442,6 @@ " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", " # -1 means decided by the optimization budget only\n", " num_samples=-1,\n", - " # model=\"gpt-35-turbo\", # the gpt-3.5-turbo model name in Azure OpenAI\n", " # model=\"gpt-3.5-turbo\", # uncomment if you don't have access to gpt-4\n", " prompt=prompts, # the prompt templates to choose from\n", " # stop=\"###\", # the stop sequence\n", From d925ed83d4047f5266c708f5e338fa5e68123c75 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Fri, 7 Apr 2023 01:46:03 +0000 Subject: [PATCH 23/27] condition to generate assertions --- flaml/autogen/code_utils.py | 2 ++ notebook/research/autogen_code.ipynb | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/flaml/autogen/code_utils.py b/flaml/autogen/code_utils.py index 571d320d4b..68b7ef28de 100644 --- a/flaml/autogen/code_utils.py +++ b/flaml/autogen/code_utils.py @@ -166,6 +166,8 @@ def implement( int: The index of the configuration which generates the implementation. 
""" cost = 0 + if len(configs) > 1 and callable(assertions): + assertions, cost = assertions(definition) for i, config in enumerate(configs): response = oai.Completion.create({"definition": definition}, **config) cost += oai.Completion.cost(config["model"], response) diff --git a/notebook/research/autogen_code.ipynb b/notebook/research/autogen_code.ipynb index 5fd40ea8b4..dd47dd1c11 100644 --- a/notebook/research/autogen_code.ipynb +++ b/notebook/research/autogen_code.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:36.910966Z", @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:36.917301Z", @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:36.925804Z", @@ -107,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:36.931255Z", @@ -127,7 +127,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1ca2f87c5d77475fb3d8478a92386c70", + "model_id": "1fdc8853bf2a4aecaa2cd024ad99b5a2", "version_major": 2, "version_minor": 0 }, @@ -156,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:39.164187Z", @@ -185,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:39.179030Z", @@ -367,7 +367,7 @@ "Example 162, config 0, success 146\n", "Example 163, config 4, success 147\n", "Success rate: 0.896\n", - "Average cost: 0.00025\n" + "Average cost: 0.00818\n" ] } ], @@ -380,9 +380,10 @@ "cost = 0\n", "success = 0\n", "for i, d in enumerate(data):\n", - " response, cost, j = implement(d[\"definition\"], configs)\n", + " response, cost_i, j = implement(d[\"definition\"], configs)\n", " metrics = success_metrics(responses=[response], **d)\n", " success += metrics[\"success\"]\n", + " cost += cost_i\n", " print(f\"Example {i}, config {j}, success {success}\")\n", "print(f\"Success rate: {success / len(data):.3f}\")\n", "print(f\"Average cost: {cost / len(data):.5f}\")" From ab0ee96dd1fef2aa6b48f1be90667bff1cae88ff Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Fri, 7 Apr 2023 13:53:09 +0000 Subject: [PATCH 24/27] init region key --- flaml/autogen/oai/completion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index 9337b22df8..1d755d5c56 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -323,6 +323,7 @@ def _eval(cls, config: dict, prune=True, eval_only=False): start_n = max_valid_n + 1 else: start_n = config_n + region_key = None params = config.copy() if "stop" in config: params["stop"] = stop From ff8126babd295c9803ef72282acfa0513469d9d8 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Fri, 7 Apr 2023 16:18:20 +0000 Subject: [PATCH 25/27] rename --- README.md | 2 +- flaml/autogen/code_utils.py | 10 +-- flaml/autogen/math_utils.py | 13 ++-- notebook/autogen_chatgpt.ipynb | 14 ++-- notebook/autogen_openai.ipynb | 72 +++++++++------------ notebook/research/autogen_code.ipynb | 4 +- notebook/research/math_level5counting.ipynb | 14 ++-- 
test/openai/test_completion.py | 45 +++++++------ website/docs/Examples/AutoGen-OpenAI.md | 8 +-- website/docs/Getting-Started.md | 2 +- website/docs/Use-Cases/Auto-Generation.md | 2 +- 11 files changed, 93 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 5fdb5e2a5e..7e199eb2eb 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ config, analysis = oai.Completion.tune( data=tune_data, metric="success", mode="max", - eval_func=success_metrics, + eval_func=eval_func, inference_budget=0.05, optimization_budget=3, num_samples=-1, diff --git a/flaml/autogen/code_utils.py b/flaml/autogen/code_utils.py index 68b7ef28de..abf9a10307 100644 --- a/flaml/autogen/code_utils.py +++ b/flaml/autogen/code_utils.py @@ -66,14 +66,14 @@ def _remove_check(response): return response[:pos] -def success_metrics( +def eval_function_completions( responses: List[str], definition: str, test: Optional[str] = None, entry_point: Optional[str] = None, assertions: Optional[Union[str, Callable[[str], Tuple[str, float]]]] = None, ) -> Dict: - """Check if the task is successful. + """Select a response from a list of responses for the function completion task (using generated assertions), and/or evaluate if the task is successful using a gold test. Args: responses (list): The list of responses. @@ -153,7 +153,7 @@ def implement( Union[str, Callable[[str], Tuple[str, float]]] ] = generate_assertions, ) -> Tuple[str, float]: - """Implement a function. + """Implement a function from a definition. Args: definition (str): The function definition, including the signature and docstr. @@ -172,7 +172,9 @@ def implement( response = oai.Completion.create({"definition": definition}, **config) cost += oai.Completion.cost(config["model"], response) responses = oai.Completion.extract_text(response) - metrics = success_metrics(responses, definition, assertions=assertions) + metrics = eval_function_completions( + responses, definition, assertions=assertions + ) assertions = metrics["assertions"] cost += metrics["gen_cost"] if metrics["succeed_assertions"] or i == len(configs) - 1: diff --git a/flaml/autogen/math_utils.py b/flaml/autogen/math_utils.py index 76ba52d979..a16b05c0c2 100644 --- a/flaml/autogen/math_utils.py +++ b/flaml/autogen/math_utils.py @@ -280,8 +280,8 @@ def voting_counts(responses): return answers -def success_metrics(responses, solution, **args): - """Check if each response is correct. +def eval_math_responses(responses, solution=None, **args): + """Select a response for a math problem using voting, and check if the response is correct if the solution is provided. Args: responses (list): The list of responses. 
@@ -292,10 +292,11 @@ def success_metrics(responses, solution, **args): """ success_list = [] n = len(responses) - for i in range(n): - response = responses[i] - succeed = is_equiv_chain_of_thought(response, solution) - success_list.append(succeed) + if solution is not None: + for i in range(n): + response = responses[i] + succeed = is_equiv_chain_of_thought(response, solution) + success_list.append(succeed) # voting answers = voting_counts(responses) # find the answer with highest votes in answers diff --git a/notebook/autogen_chatgpt.ipynb b/notebook/autogen_chatgpt.ipynb index a60fab4e30..dcf4714688 100644 --- a/notebook/autogen_chatgpt.ipynb +++ b/notebook/autogen_chatgpt.ipynb @@ -277,7 +277,7 @@ }, "outputs": [], "source": [ - "from flaml.autogen.math_utils import success_metrics" + "from flaml.autogen.math_utils import eval_math_responses" ] }, { @@ -435,7 +435,7 @@ " data=tune_data, # the data for tuning\n", " metric=\"success_vote\", # the metric to optimize\n", " mode=\"max\", # the optimization mode\n", - " eval_func=success_metrics, # the evaluation function to return the success metrics\n", + " eval_func=eval_math_responses, # the evaluation function to return the success metrics\n", " # log_file_name=\"logs/math.log\", # the log file name\n", " inference_budget=0.02, # the inference budget (dollar)\n", " optimization_budget=1, # the optimization budget (dollar)\n", @@ -970,7 +970,7 @@ ], "source": [ "response = oai.ChatCompletion.create(context=tune_data[1], **config)\n", - "metric_results = success_metrics(oai.ChatCompletion.extract_text(response), **tune_data[1])\n", + "metric_results = eval_math_responses(oai.ChatCompletion.extract_text(response), **tune_data[1])\n", "print(\"response on an example data instance:\", response)\n", "print(\"metric_results on the example data instance:\", metric_results)\n" ] @@ -1006,7 +1006,7 @@ } ], "source": [ - "# result = oai.Completion.test(test_data, config, success_metrics)\n", + "# result = oai.Completion.test(test_data, config, eval_math_responses)\n", "# print(\"performance on test data with the tuned config:\", result)" ] }, @@ -1036,7 +1036,7 @@ "# the following code will cost roughly $2 if uncommented and run.\n", "\n", "# default_config = {\"model\": 'gpt-4', \"prompt\": prompts[0]}\n", - "# default_result = oai.Completion.test(test_data, default_config, success_metrics)\n", + "# default_result = oai.Completion.test(test_data, default_config, eval_math_responses)\n", "# print(\"performance on test data from gpt-4 with a default config:\", default_result)" ] }, @@ -1084,7 +1084,7 @@ "# The following evaluation costs $3 and longer than one hour if you uncomment it and run it.\n", "\n", "# config_n2 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"n\": 2}\n", - "# result_n2 = oai.ChatCompletion.test(test_data, config_n2, success_metrics)\n", + "# result_n2 = oai.ChatCompletion.test(test_data, config_n2, eval_math_responses)\n", "# print(\"performance on test data from gpt-4 with a default config and n=2:\", result_n2)\n" ] }, @@ -1113,7 +1113,7 @@ "# The following evaluation costs $8 and longer than one hour if you uncomment it and run it.\n", "\n", "# config_n5 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"n\": 5}\n", - "# result_n5 = oai.ChatCompletion.test(test_data, config_n5, success_metrics)\n", + "# result_n5 = oai.ChatCompletion.test(test_data, config_n5, eval_math_responses)\n", "# print(\"performance on test data from gpt-4 with a default config and n=5:\", result_n5)" ] }, diff --git a/notebook/autogen_openai.ipynb 
b/notebook/autogen_openai.ipynb index 5b8fc95e63..46e507c4db 100644 --- a/notebook/autogen_openai.ipynb +++ b/notebook/autogen_openai.ipynb @@ -67,7 +67,7 @@ "import os\n", "\n", "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"\n" + " os.environ[\"OPENAI_API_KEY\"] = \"\"" ] }, { @@ -129,7 +129,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f70bc8cc932c4c5aa9e9e9fcfd598078", + "model_id": "0be40d7ad7f049f1946bd69b0c570f33", "version_major": 2, "version_minor": 0 }, @@ -295,9 +295,9 @@ "outputs": [], "source": [ "from functools import partial\n", - "from flaml.autogen.code_utils import success_metrics, generate_assertions\n", + "from flaml.autogen.code_utils import eval_function_completions, generate_assertions\n", "\n", - "success_metrics_with_generated_assertions = partial(success_metrics, assertions=generate_assertions)" + "eval_with_generated_assertions = partial(eval_function_completions, assertions=generate_assertions)" ] }, { @@ -422,35 +422,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2023-04-02 21:07:15,735]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", - "\u001b[32m[I 2023-04-02 21:07:15,738]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001b[32m[I 2023-04-07 15:55:46,931]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001b[32m[I 2023-04-07 15:55:46,934]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 04-02 21:07:15] {832} INFO - trial 1 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-02 21:07:32] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'total_cost': 0.010323600000000004, 'cost': 0.010323600000000004, 'inference_cost': 0.00022578, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.89126968383789}\n", - "[flaml.tune.tune: 04-02 21:07:32] {832} INFO - trial 2 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-02 21:07:49] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'total_cost': 0.03038410000000001, 'cost': 0.020060500000000002, 'inference_cost': 0.001003025, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.967792749404907}\n", - "[flaml.tune.tune: 04-02 21:07:49] {832} INFO - trial 3 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': 
{'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-02 21:07:53] {215} INFO - result: {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 3.70871901512146}\n", - "[flaml.tune.tune: 04-02 21:07:53] {832} INFO - trial 4 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-02 21:08:03] {215} INFO - result: {'index_selected': 13.85, 'succeed_assertions': 0.55, 'success': 0.5, 'gen_cost': 0.00046369999999999994, 'total_cost': 0.9526220999999998, 'cost': 0.065458, 'inference_cost': 0.0033335, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 9.844265460968018}\n", - "[flaml.tune.tune: 04-02 21:08:03] {832} INFO - trial 5 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-02 21:08:03] {215} INFO - result: {'success': 0, 'total_cost': 1.1518221, 'cost': 0.1992, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 0.0029821395874023438}\n", - "[flaml.tune.tune: 04-02 21:08:03] {832} INFO - trial 6 config: {'prompt': 0, 'stop': 0, 'subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}}\n", - "[flaml.tune.tune: 04-02 21:08:03] {215} INFO - result: {'success': 0, 'total_cost': 1.2810420999999999, 'cost': 0.12922, 'training_iteration': 0, 'config': {'prompt': 0, 'stop': 0, 'subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}}, 'config/prompt': 0, 'config/stop': 0, 'config/subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}, 'experiment_tag': 'exp', 'time_total_s': 0.0020961761474609375}\n", - "[flaml.tune.tune: 04-02 21:08:03] {832} INFO - trial 7 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}}\n", - "[flaml.tune.tune: 04-02 21:08:06] {215} INFO - result: {'index_selected': 3.1, 'succeed_assertions': 0.9, 'success': 0.7, 'gen_cost': 0.00046369999999999994, 'total_cost': 2.0943421, 'cost': 0.8133, 'inference_cost': 
0.038592, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}, 'experiment_tag': 'exp', 'time_total_s': 3.2131495475769043}\n", - "[flaml.tune.tune: 04-02 21:08:06] {832} INFO - trial 8 config: {'prompt': 1, 'stop': 0, 'subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}}\n", - "[flaml.tune.tune: 04-02 21:08:27] {215} INFO - result: {'index_selected': 31.1, 'succeed_assertions': 0.55, 'success': 0.45, 'gen_cost': 0.00046369999999999994, 'total_cost': 2.2339381000000005, 'cost': 0.139596, 'inference_cost': 0.0070404, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}, 'experiment_tag': 'exp', 'time_total_s': 20.68828248977661}\n", - "[flaml.tune.tune: 04-02 21:08:27] {832} INFO - trial 9 config: {'prompt': 2, 'stop': 0, 'subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}}\n", - "[flaml.tune.tune: 04-02 21:08:27] {215} INFO - result: {'success': 0, 'total_cost': 2.3659581000000007, 'cost': 0.13202000000000003, 'training_iteration': 0, 'config': {'prompt': 2, 'stop': 0, 'subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}}, 'config/prompt': 2, 'config/stop': 0, 'config/subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}, 'experiment_tag': 'exp', 'time_total_s': 0.002319812774658203}\n", - "[flaml.tune.tune: 04-02 21:08:27] {832} INFO - trial 10 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}}\n", - "[flaml.tune.tune: 04-02 21:08:27] {215} INFO - result: {'success': 0, 'total_cost': 3.0324381, 'cost': 0.66648, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}, 'experiment_tag': 'exp', 'time_total_s': 0.006746053695678711}\n", - "[flaml.tune.tune: 04-02 21:08:27] {855} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + "[flaml.tune.tune: 04-07 15:55:46] {832} INFO - trial 1 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 15:56:03] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.010323600000000004, 'cost': 0.010323600000000004, 
'inference_cost': 0.00022578, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.854185342788696}\n", + "[flaml.tune.tune: 04-07 15:56:03] {832} INFO - trial 2 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 15:56:20] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.03038410000000001, 'cost': 0.020060500000000002, 'inference_cost': 0.001003025, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 17.066588163375854}\n", + "[flaml.tune.tune: 04-07 15:56:20] {832} INFO - trial 3 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 15:56:25] {215} INFO - result: {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 4.1644368171691895}\n", + "[flaml.tune.tune: 04-07 15:56:25] {832} INFO - trial 4 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 15:56:34] {215} INFO - result: {'index_selected': 13.85, 'succeed_assertions': 0.55, 'success': 0.5, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.9526220999999998, 'cost': 0.065458, 'inference_cost': 0.0033335, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 9.954304933547974}\n", + "[flaml.tune.tune: 04-07 15:56:34] {832} INFO - trial 5 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': 
{'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 15:56:34] {215} INFO - result: {'success': 0, 'total_cost': 1.1518221, 'cost': 0.1992, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 0.003031015396118164}\n", + "[flaml.tune.tune: 04-07 15:56:34] {832} INFO - trial 6 config: {'prompt': 0, 'stop': 0, 'subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}}\n", + "[flaml.tune.tune: 04-07 15:56:34] {215} INFO - result: {'success': 0, 'total_cost': 1.2810420999999999, 'cost': 0.12922, 'training_iteration': 0, 'config': {'prompt': 0, 'stop': 0, 'subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}}, 'config/prompt': 0, 'config/stop': 0, 'config/subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}, 'experiment_tag': 'exp', 'time_total_s': 0.0019817352294921875}\n", + "[flaml.tune.tune: 04-07 15:56:34] {832} INFO - trial 7 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}}\n", + "[flaml.tune.tune: 04-07 15:56:38] {215} INFO - result: {'index_selected': 3.1, 'succeed_assertions': 0.9, 'success': 0.7, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 2.0943421, 'cost': 0.8133, 'inference_cost': 0.038592, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}, 'experiment_tag': 'exp', 'time_total_s': 3.2032766342163086}\n", + "[flaml.tune.tune: 04-07 15:56:38] {832} INFO - trial 8 config: {'prompt': 1, 'stop': 0, 'subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}}\n", + "[flaml.tune.tune: 04-07 15:56:58] {215} INFO - result: {'index_selected': 31.1, 'succeed_assertions': 0.55, 'success': 0.45, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 2.2339381000000005, 'cost': 0.139596, 'inference_cost': 0.0070404, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}, 'experiment_tag': 'exp', 'time_total_s': 20.776247024536133}\n", + "[flaml.tune.tune: 04-07 15:56:58] {832} INFO - trial 9 config: {'prompt': 2, 'stop': 0, 'subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}}\n", + "[flaml.tune.tune: 
04-07 15:56:58] {215} INFO - result: {'success': 0, 'total_cost': 2.3659581000000007, 'cost': 0.13202000000000003, 'training_iteration': 0, 'config': {'prompt': 2, 'stop': 0, 'subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}}, 'config/prompt': 2, 'config/stop': 0, 'config/subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}, 'experiment_tag': 'exp', 'time_total_s': 0.0025119781494140625}\n", + "[flaml.tune.tune: 04-07 15:56:58] {832} INFO - trial 10 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}}\n", + "[flaml.tune.tune: 04-07 15:56:58] {215} INFO - result: {'success': 0, 'total_cost': 3.0324381, 'cost': 0.66648, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}, 'experiment_tag': 'exp', 'time_total_s': 0.006719112396240234}\n", + "[flaml.tune.tune: 04-07 15:56:58] {855} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" ] } ], @@ -459,7 +459,7 @@ " data=tune_data, # the data for tuning\n", " metric=\"success\", # the metric to optimize\n", " mode=\"max\", # the optimization mode\n", - " eval_func=success_metrics_with_generated_assertions, # the evaluation function to return the success metrics\n", + " eval_func=eval_with_generated_assertions, # the evaluation function to return the success metrics\n", " # log_file_name=\"logs/humaneval.log\", # the log file name\n", " inference_budget=0.05, # the inference budget (dollar)\n", " optimization_budget=3, # the optimization budget (dollar)\n", @@ -502,7 +502,7 @@ "output_type": "stream", "text": [ "optimized config {'prompt': '# Python 3{definition}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'model': 'text-davinci-003', 'max_tokens': 148, 'n': 27, 'top_p': 0.755486898036596}\n", - "best result on tuning data {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 3.70871901512146}\n" + "best result on tuning data {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 
0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 4.1644368171691895}\n" ] } ], @@ -720,14 +720,14 @@ " \"total_tokens\": 1441\n", " }\n", "}\n", - "{'expected_success': 1.0, 'success': True}\n" + "{'index_selected': 0, 'succeed_assertions': 1, 'success': 1, 'gen_cost': 0.000702, 'assertions': 'assert compare([1,2,3,4,5,1],[1,2,3,4,2,-2]) == [0,0,0,0,3,3]\\nassert compare([0,5,0,0,0,4],[4,1,1,0,0,-2]) == [4,4,1,0,0,6]'}\n" ] } ], "source": [ "response = oai.Completion.create(context=tune_data[1], **config)\n", "print(response)\n", - "print(success_metrics(oai.Completion.extract_text(response), **tune_data[1]))\n" + "print(eval_with_generated_assertions(oai.Completion.extract_text(response), **tune_data[1]))\n" ] }, { @@ -751,15 +751,7 @@ "shell.execute_reply": "2023-02-24T23:29:27.159519Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "performance on test data with the tuned config: {'index_selected': 5.208333333333333, 'succeed_assertions': 0.8402777777777778, 'success': 0.7777777777777778, 'gen_cost': 0.00045375000000000005, 'cost': 5.785519999999999, 'inference_cost': 0.04017722222222222}\n" - ] - } - ], + "outputs": [], "source": [ "# result = oai.Completion.test(test_data, config)\n", "# print(\"performance on test data with the tuned config:\", result)" diff --git a/notebook/research/autogen_code.ipynb b/notebook/research/autogen_code.ipynb index dd47dd1c11..a796761eb2 100644 --- a/notebook/research/autogen_code.ipynb +++ b/notebook/research/autogen_code.ipynb @@ -167,7 +167,7 @@ }, "outputs": [], "source": [ - "from flaml.autogen.code_utils import success_metrics, implement\n", + "from flaml.autogen.code_utils import eval_function_completions, implement\n", "from flaml import oai" ] }, @@ -381,7 +381,7 @@ "success = 0\n", "for i, d in enumerate(data):\n", " response, cost_i, j = implement(d[\"definition\"], configs)\n", - " metrics = success_metrics(responses=[response], **d)\n", + " metrics = eval_function_completions(responses=[response], **d)\n", " success += metrics[\"success\"]\n", " cost += cost_i\n", " print(f\"Example {i}, config {j}, success {success}\")\n", diff --git a/notebook/research/math_level5counting.ipynb b/notebook/research/math_level5counting.ipynb index ed7aaf45b4..00e306224e 100644 --- a/notebook/research/math_level5counting.ipynb +++ b/notebook/research/math_level5counting.ipynb @@ -65,9 +65,7 @@ "import os\n", "\n", "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"\n", - "# import openai\n", - "# openai.api_key_path = \"../key.txt\"" + " os.environ[\"OPENAI_API_KEY\"] = \"\"" ] }, { @@ -223,7 +221,7 @@ }, "outputs": [], "source": [ - "from flaml.autogen.math_utils import success_metrics" + "from flaml.autogen.math_utils import eval_math_responses" ] }, { @@ -327,7 +325,7 @@ "import logging\n", "\n", "config_n1 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": 1}\n", - "n1_result = oai.ChatCompletion.test(test_data[:50], config_n1, success_metrics)\n", + "n1_result = oai.ChatCompletion.test(test_data[:50], config_n1, eval_math_responses)\n", "print(n1_result)" ] }, @@ -339,7 +337,7 @@ "source": [ "oai.ChatCompletion.request_timeout = 120\n", "config_n10 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": 10}\n", - "n10_result = oai.ChatCompletion.test(test_data[:50], config_n10, success_metrics, logging_level=logging.INFO)\n", + "n10_result = oai.ChatCompletion.test(test_data[:50], config_n10, eval_math_responses, 
logging_level=logging.INFO)\n", "print(n10_result)" ] }, @@ -350,7 +348,7 @@ "outputs": [], "source": [ "config_n30 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": 30}\n", - "n30_result = oai.ChatCompletion.test(test_data[:50], config_n30, success_metrics, logging_level=logging.INFO)\n", + "n30_result = oai.ChatCompletion.test(test_data[:50], config_n30, eval_math_responses, logging_level=logging.INFO)\n", "print(n30_result)" ] }, @@ -373,7 +371,7 @@ " for i, data_i in enumerate(test_data[:50]):\n", " response = oai.ChatCompletion.create(context=data_i, **config)\n", " responses = oai.ChatCompletion.extract_text(response)\n", - " metrics.append(success_metrics(responses, **data_i))\n", + " metrics.append(eval_math_responses(responses, **data_i))\n", " votes = metrics[-1][\"votes\"]\n", " success = metrics[-1][\"success_vote\"]\n", " votes_success[votes][0] += 1\n", diff --git a/test/openai/test_completion.py b/test/openai/test_completion.py index 78fc07355b..1d04ab4435 100644 --- a/test/openai/test_completion.py +++ b/test/openai/test_completion.py @@ -4,8 +4,12 @@ import pytest from functools import partial from flaml import oai -from flaml.autogen.code_utils import success_metrics, generate_assertions, implement -from flaml.autogen.math_utils import success_metrics as math_success_metrics +from flaml.autogen.code_utils import ( + eval_function_completions, + generate_assertions, + implement, +) +from flaml.autogen.math_utils import eval_math_responses @pytest.mark.skipif( @@ -13,8 +17,8 @@ reason="do not run on windows", ) def test_humaneval(num_samples=1): - success_metrics_with_generated_assertions = partial( - success_metrics, assertions=generate_assertions + eval_with_generated_assertions = partial( + eval_function_completions, assertions=generate_assertions ) seed = 41 @@ -48,7 +52,7 @@ def test_humaneval(num_samples=1): data=tune_data, metric="success", mode="max", - eval_func=success_metrics, + eval_func=eval_function_completions, n=1, prompt="{definition}", ) @@ -58,7 +62,7 @@ def test_humaneval(num_samples=1): data=tune_data, metric="succeed_assertions", mode="max", - eval_func=success_metrics_with_generated_assertions, + eval_func=eval_with_generated_assertions, n=1, model="gpt-3.5-turbo", prompt="{definition}", @@ -69,7 +73,7 @@ def test_humaneval(num_samples=1): data=tune_data, metric="expected_success", mode="max", - eval_func=success_metrics, + eval_func=eval_function_completions, n=1, messages=[{"role": "user", "content": "{definition}"}], ) @@ -78,13 +82,13 @@ def test_humaneval(num_samples=1): code, cost, _ = implement(tune_data[1], [config]) print(code) print(cost) - print(success_metrics([code], **tune_data[1])) + print(eval_function_completions([code], **tune_data[1])) # a more comprehensive tuning example - config, analysis = oai.Completion.tune( + config2, analysis = oai.Completion.tune( data=tune_data, metric="success", mode="max", - eval_func=success_metrics_with_generated_assertions, + eval_func=eval_with_generated_assertions, log_file_name="logs/humaneval.log", inference_budget=0.002, optimization_budget=2, @@ -96,16 +100,19 @@ def test_humaneval(num_samples=1): ], stop=[["\nclass", "\ndef", "\nif", "\nprint"], None], # the stop sequences ) - print(config) + print(config2) print(analysis.best_result) print(test_data[0]) - responses = oai.Completion.create(context=test_data[0], **config) + responses = oai.Completion.create(context=test_data[0], **config2) print(responses) oai.Completion.data = test_data[:num_samples] result = 
oai.Completion._eval(analysis.best_config, prune=False, eval_only=True) print("result without pruning", result) - result = oai.Completion.test(test_data[:num_samples], config=config) + result = oai.Completion.test(test_data[:num_samples], config=config2) print(result) + code, cost, selected = implement(tune_data[1], [config2, config]) + print(selected) + print(eval_function_completions([code], **tune_data[1])) def test_math(num_samples=-1): @@ -159,13 +166,13 @@ def test_math(num_samples=-1): } test_data_sample = test_data[0:3] result = oai.ChatCompletion.test( - test_data_sample, vanilla_config, math_success_metrics + test_data_sample, vanilla_config, eval_math_responses ) test_data_sample = test_data[3:6] result = oai.ChatCompletion.test( test_data_sample, vanilla_config, - math_success_metrics, + eval_math_responses, use_cache=False, agg_method="median", ) @@ -179,14 +186,14 @@ def my_average(results): result = oai.ChatCompletion.test( test_data_sample, vanilla_config, - math_success_metrics, + eval_math_responses, use_cache=False, agg_method=my_median, ) result = oai.ChatCompletion.test( test_data_sample, vanilla_config, - math_success_metrics, + eval_math_responses, use_cache=False, agg_method={ "expected_success": my_median, @@ -202,7 +209,7 @@ def my_average(results): data=tune_data, # the data for tuning metric="expected_success", # the metric to optimize mode="max", # the optimization mode - eval_func=math_success_metrics, # the evaluation function to return the success metrics + eval_func=eval_math_responses, # the evaluation function to return the success metrics # log_file_name="logs/math.log", # the log file name inference_budget=0.002, # the inference budget (dollar) optimization_budget=0.01, # the optimization budget (dollar) @@ -220,4 +227,4 @@ def my_average(results): openai.api_key_path = "test/openai/key.txt" test_humaneval(1) - test_math(1) + # test_math(1) diff --git a/website/docs/Examples/AutoGen-OpenAI.md b/website/docs/Examples/AutoGen-OpenAI.md index 19ee7c61fe..81126400f3 100644 --- a/website/docs/Examples/AutoGen-OpenAI.md +++ b/website/docs/Examples/AutoGen-OpenAI.md @@ -62,9 +62,9 @@ Before starting tuning, you need to define the metric for the optimization. For ```python from functools import partial -from flaml.autogen.code_utils import success_metrics, generate_assertions +from flaml.autogen.code_utils import eval_function_completions, generate_assertions -success_metrics_with_generated_assertions = partial(success_metrics, assertions=generate_assertions) +eval_with_generated_assertions = partial(eval_function_completions, assertions=generate_assertions) ``` This function will first generate assertion statements for each problem. Then, it uses the assertions to select the generated responses. 
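Since `eval_with_generated_assertions` is central to this patch's renames, a minimal sketch of the selection logic it wraps may help reviewers (illustrative only: the shipped `eval_function_completions` in `flaml.autogen.code_utils` also tracks generation cost and falls back to the canonical test when no assertions are given; `select_by_assertions` and the sandbox-free `exec` below are simplifications, not the library's code):

```python
from typing import Callable, List, Optional, Union


def select_by_assertions(
    responses: List[str],
    definition: str,
    assertions: Optional[Union[str, Callable]] = None,
    **_kwargs,
) -> dict:
    """Pick the first completion that satisfies the (possibly generated) assertions."""
    if callable(assertions):
        # e.g., generate_assertions asks a model to write asserts for `definition`
        assertions, _gen_cost = assertions(definition)
    for i, completion in enumerate(responses):
        candidate = definition + completion + "\n" + (assertions or "")
        try:
            exec(candidate, {})  # run the candidate against the assertions
        except Exception:
            continue  # an assertion failed (or the code didn't run): try the next one
        return {"index_selected": i, "succeed_assertions": 1, "success": 1}
    return {"index_selected": len(responses), "succeed_assertions": 0, "success": 0}
```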
@@ -86,7 +86,7 @@ config, analysis = oai.Completion.tune( data=tune_data, # the data for tuning metric="success", # the metric to optimize mode="max", # the optimization mode - eval_func=success_metrics_with_generated_assertions, # the evaluation function to return the success metrics + eval_func=eval_with_generated_assertions, # the evaluation function to return the success metrics # log_file_name="logs/humaneval.log", # the log file name inference_budget=0.05, # the inference budget (dollar) optimization_budget=3, # the optimization budget (dollar) @@ -118,7 +118,7 @@ We can apply the tuned config to the request for an instance: ```python responses = oai.Completion.create(context=tune_data[1], **config) print(responses) -print(success_metrics(oai.Completion.extract_text(response), **tune_data[1])) +print(eval_with_generated_assertions(oai.Completion.extract_text(response), **tune_data[1])) ``` #### Evaluate the success rate on the test data diff --git a/website/docs/Getting-Started.md b/website/docs/Getting-Started.md index 67f97eeeaa..3fe35ca1ca 100644 --- a/website/docs/Getting-Started.md +++ b/website/docs/Getting-Started.md @@ -97,7 +97,7 @@ config, analysis = oai.Completion.tune( data=tune_data, metric="success", mode="max", - eval_func=success_metrics, + eval_func=eval_func, inference_budget=0.05, optimization_budget=3, num_samples=-1, diff --git a/website/docs/Use-Cases/Auto-Generation.md b/website/docs/Use-Cases/Auto-Generation.md index 5fc10a21f9..3158ed7909 100644 --- a/website/docs/Use-Cases/Auto-Generation.md +++ b/website/docs/Use-Cases/Auto-Generation.md @@ -88,7 +88,7 @@ config, analysis = oai.Completion.tune( data=tune_data, metric="success", mode="max", - eval_func=success_metrics, + eval_func=eval_func, inference_budget=0.05, optimization_budget=3, num_samples=-1, From c653b22884cd71e0c93bdeb4bbef2358e77f9fb1 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Fri, 7 Apr 2023 17:54:44 +0000 Subject: [PATCH 26/27] comments about budget --- flaml/autogen/oai/completion.py | 4 +- notebook/autogen_chatgpt.ipynb | 4 +- notebook/autogen_openai.ipynb | 60 ++++++++++++------------- website/docs/Examples/AutoGen-OpenAI.md | 4 +- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index 1d755d5c56..513bcf8f21 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -512,8 +512,8 @@ def eval_func(responses, **data): ``` log_file_name (str, optional): The log file. - inference_budget (float, optional): The inference budget. - optimization_budget (float, optional): The optimization budget. + inference_budget (float, optional): The inference budget, dollar per instance. + optimization_budget (float, optional): The optimization budget, dollar in total. num_samples (int, optional): The number of samples to evaluate. -1 means no hard restriction in the number of trials and the actual number is decided by optimization_budget. Defaults to 1. 
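The docstring change above pins down the units of the two budgets; for concreteness, a sketch of how they are passed, mirroring the calls in the notebooks below (the dummy `eval_func` and one-item `tune_data` are placeholders for illustration only, and running this issues real OpenAI requests, i.e., spends actual dollars against the stated budgets):

```python
from flaml import oai


def eval_func(responses, **data):
    # placeholder metric: count any non-empty response as a success
    return {"success": int(any(r.strip() for r in responses))}


tune_data = [{"definition": 'def add(a, b):\n    """Return a + b."""\n'}]

config, analysis = oai.Completion.tune(
    data=tune_data,
    metric="success",
    mode="max",
    eval_func=eval_func,
    prompt="{definition}",
    inference_budget=0.05,  # dollar per instance: caps the cost of evaluating one instance
    optimization_budget=1,  # dollar in total: tuning stops once trials have spent this much
    num_samples=-1,  # -1: number of trials limited only by optimization_budget
)
```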
diff --git a/notebook/autogen_chatgpt.ipynb b/notebook/autogen_chatgpt.ipynb index dcf4714688..6ac607077b 100644 --- a/notebook/autogen_chatgpt.ipynb +++ b/notebook/autogen_chatgpt.ipynb @@ -437,8 +437,8 @@ " mode=\"max\", # the optimization mode\n", " eval_func=eval_math_responses, # the evaluation function to return the success metrics\n", " # log_file_name=\"logs/math.log\", # the log file name\n", - " inference_budget=0.02, # the inference budget (dollar)\n", - " optimization_budget=1, # the optimization budget (dollar)\n", + " inference_budget=0.02, # the inference budget (dollar per instance)\n", + " optimization_budget=1, # the optimization budget (dollar in total)\n", " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", " # -1 means decided by the optimization budget only\n", " num_samples=-1,\n", diff --git a/notebook/autogen_openai.ipynb b/notebook/autogen_openai.ipynb index 46e507c4db..8d1c0877e2 100644 --- a/notebook/autogen_openai.ipynb +++ b/notebook/autogen_openai.ipynb @@ -408,7 +408,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:40.593603Z", @@ -422,35 +422,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2023-04-07 15:55:46,931]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", - "\u001b[32m[I 2023-04-07 15:55:46,934]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001b[32m[I 2023-04-07 17:47:31,801]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001b[32m[I 2023-04-07 17:47:31,804]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 04-07 15:55:46] {832} INFO - trial 1 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-07 15:56:03] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.010323600000000004, 'cost': 0.010323600000000004, 'inference_cost': 0.00022578, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.854185342788696}\n", - "[flaml.tune.tune: 04-07 15:56:03] {832} INFO - trial 2 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-07 15:56:20] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.03038410000000001, 'cost': 0.020060500000000002, 'inference_cost': 0.001003025, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 
'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 17.066588163375854}\n", - "[flaml.tune.tune: 04-07 15:56:20] {832} INFO - trial 3 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-07 15:56:25] {215} INFO - result: {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 4.1644368171691895}\n", - "[flaml.tune.tune: 04-07 15:56:25] {832} INFO - trial 4 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-07 15:56:34] {215} INFO - result: {'index_selected': 13.85, 'succeed_assertions': 0.55, 'success': 0.5, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.9526220999999998, 'cost': 0.065458, 'inference_cost': 0.0033335, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 9.954304933547974}\n", - "[flaml.tune.tune: 04-07 15:56:34] {832} INFO - trial 5 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", - "[flaml.tune.tune: 04-07 15:56:34] {215} INFO - result: {'success': 0, 'total_cost': 1.1518221, 'cost': 0.1992, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 0.003031015396118164}\n", - "[flaml.tune.tune: 04-07 15:56:34] {832} INFO - trial 6 config: {'prompt': 0, 'stop': 0, 'subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}}\n", - "[flaml.tune.tune: 04-07 15:56:34] {215} INFO - result: {'success': 0, 'total_cost': 1.2810420999999999, 'cost': 0.12922, 'training_iteration': 0, 'config': {'prompt': 0, 'stop': 0, 'subspace': {'max_tokens': 388, 'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}}, 'config/prompt': 0, 'config/stop': 0, 'config/subspace': {'max_tokens': 388, 
'temperature_or_top_p': {'temperature': 0.5121922633857766}, 'n': 81, 'model': 'text-davinci-003'}, 'experiment_tag': 'exp', 'time_total_s': 0.0019817352294921875}\n", - "[flaml.tune.tune: 04-07 15:56:34] {832} INFO - trial 7 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}}\n", - "[flaml.tune.tune: 04-07 15:56:38] {215} INFO - result: {'index_selected': 3.1, 'succeed_assertions': 0.9, 'success': 0.7, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 2.0943421, 'cost': 0.8133, 'inference_cost': 0.038592, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 217, 'n': 24, 'temperature_or_top_p': {'top_p': 0.6977557809981421}}, 'experiment_tag': 'exp', 'time_total_s': 3.2032766342163086}\n", - "[flaml.tune.tune: 04-07 15:56:38] {832} INFO - trial 8 config: {'prompt': 1, 'stop': 0, 'subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}}\n", - "[flaml.tune.tune: 04-07 15:56:58] {215} INFO - result: {'index_selected': 31.1, 'succeed_assertions': 0.55, 'success': 0.45, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 2.2339381000000005, 'cost': 0.139596, 'inference_cost': 0.0070404, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'max_tokens': 182, 'temperature_or_top_p': {'temperature': 0.6503971819314672}, 'n': 60, 'model': 'gpt-3.5-turbo'}, 'experiment_tag': 'exp', 'time_total_s': 20.776247024536133}\n", - "[flaml.tune.tune: 04-07 15:56:58] {832} INFO - trial 9 config: {'prompt': 2, 'stop': 0, 'subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}}\n", - "[flaml.tune.tune: 04-07 15:56:58] {215} INFO - result: {'success': 0, 'total_cost': 2.3659581000000007, 'cost': 0.13202000000000003, 'training_iteration': 0, 'config': {'prompt': 2, 'stop': 0, 'subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}}, 'config/prompt': 2, 'config/stop': 0, 'config/subspace': {'max_tokens': 257, 'temperature_or_top_p': {'temperature': 0.8568503024577332}, 'n': 35, 'model': 'text-davinci-003'}, 'experiment_tag': 'exp', 'time_total_s': 0.0025119781494140625}\n", - "[flaml.tune.tune: 04-07 15:56:58] {832} INFO - trial 10 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}}\n", - "[flaml.tune.tune: 04-07 15:56:58] {215} INFO - result: {'success': 0, 'total_cost': 3.0324381, 'cost': 0.66648, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 101, 
'n': 30, 'temperature_or_top_p': {'top_p': 0.8132180150750499}}, 'experiment_tag': 'exp', 'time_total_s': 0.006719112396240234}\n", - "[flaml.tune.tune: 04-07 15:56:58] {855} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + "[flaml.tune.tune: 04-07 17:47:31] {832} INFO - trial 1 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:47:48] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.010323600000000004, 'cost': 0.010323600000000004, 'inference_cost': 0.00022578, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.660529136657715}\n", + "[flaml.tune.tune: 04-07 17:47:48] {832} INFO - trial 2 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:05] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.03038410000000001, 'cost': 0.020060500000000002, 'inference_cost': 0.001003025, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.726527452468872}\n", + "[flaml.tune.tune: 04-07 17:48:05] {832} INFO - trial 3 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:08] {215} INFO - result: {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 3.7132015228271484}\n", + "[flaml.tune.tune: 04-07 17:48:08] {832} INFO - trial 4 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:18] {215} INFO - result: {'index_selected': 13.85, 'succeed_assertions': 0.55, 'success': 0.5, 'gen_cost': 
0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.9526220999999998, 'cost': 0.065458, 'inference_cost': 0.0033335, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 9.689077615737915}\n", + "[flaml.tune.tune: 04-07 17:48:18] {832} INFO - trial 5 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:18] {215} INFO - result: {'success': 0, 'total_cost': 1.0297820999999998, 'cost': 0.07715999999999999, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 0.002007722854614258}\n", + "[flaml.tune.tune: 04-07 17:48:18] {855} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" ] } ], @@ -461,8 +451,8 @@ " mode=\"max\", # the optimization mode\n", " eval_func=eval_with_generated_assertions, # the evaluation function to return the success metrics\n", " # log_file_name=\"logs/humaneval.log\", # the log file name\n", - " inference_budget=0.05, # the inference budget (dollar)\n", - " optimization_budget=3, # the optimization budget (dollar)\n", + " inference_budget=0.05, # the inference budget (dollar per instance)\n", + " optimization_budget=1, # the optimization budget (dollar in total)\n", " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", " # -1 means decided by the optimization budget only\n", " num_samples=-1,\n", @@ -487,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:26:38.352710Z", @@ -502,7 +492,7 @@ "output_type": "stream", "text": [ "optimized config {'prompt': '# Python 3{definition}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'model': 'text-davinci-003', 'max_tokens': 148, 'n': 27, 'top_p': 0.755486898036596}\n", - "best result on tuning data {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 4.1644368171691895}\n" + "best result on tuning data {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert 
vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 3.7132015228271484}\n" ] } ], @@ -527,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:26:38.359902Z", @@ -742,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:26:39.347295Z", @@ -751,7 +741,15 @@ "shell.execute_reply": "2023-02-24T23:29:27.159519Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "performance on test data with the tuned config: {'index_selected': 5.208333333333333, 'succeed_assertions': 0.8402777777777778, 'success': 0.7777777777777778, 'gen_cost': 0.00045375000000000005, 'cost': 5.785519999999999, 'inference_cost': 0.04017722222222222}\n" + ] + } + ], "source": [ "# result = oai.Completion.test(test_data, config)\n", "# print(\"performance on test data with the tuned config:\", result)" diff --git a/website/docs/Examples/AutoGen-OpenAI.md b/website/docs/Examples/AutoGen-OpenAI.md index 81126400f3..19e35f992a 100644 --- a/website/docs/Examples/AutoGen-OpenAI.md +++ b/website/docs/Examples/AutoGen-OpenAI.md @@ -88,8 +88,8 @@ config, analysis = oai.Completion.tune( mode="max", # the optimization mode eval_func=eval_with_generated_assertions, # the evaluation function to return the success metrics # log_file_name="logs/humaneval.log", # the log file name - inference_budget=0.05, # the inference budget (dollar) - optimization_budget=3, # the optimization budget (dollar) + inference_budget=0.05, # the inference budget (dollar per instance) + optimization_budget=3, # the optimization budget (dollar in total) # num_samples can further limit the number of trials for different hyperparameter configurations; # -1 means decided by the optimization budget only num_samples=-1, From c845c69328041083286954c3ee259593b54a7812 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Sat, 8 Apr 2023 00:23:54 +0000 Subject: [PATCH 27/27] prompt --- notebook/research/math_level5counting.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebook/research/math_level5counting.ipynb b/notebook/research/math_level5counting.ipynb index 00e306224e..5f2ba0bca3 100644 --- a/notebook/research/math_level5counting.ipynb +++ b/notebook/research/math_level5counting.ipynb @@ -303,7 +303,7 @@ }, "outputs": [], "source": [ - "prompts = [\"{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.\"]" + "prompt = \"{problem} Solve the problem carefully. Simplify your answer as much as possible. 
Put the final answer in \\\\boxed{{}}.\"" ] }, { @@ -324,7 +324,7 @@ "source": [ "import logging\n", "\n", - "config_n1 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": 1}\n", + "config_n1 = {\"model\": 'gpt-4', \"prompt\": prompt, \"max_tokens\": 600, \"n\": 1}\n", "n1_result = oai.ChatCompletion.test(test_data[:50], config_n1, eval_math_responses)\n", "print(n1_result)" ]
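Patch 27 collapses the `prompts` list into a single `prompt` template; the `{problem}` field is filled from each data instance at request time, while the doubled braces keep the `\boxed{}` directive literal. A standalone illustration of the templating (plain `str.format`, independent of the oai wrapper; the sample problem is made up):

```python
prompt = (
    "{problem} Solve the problem carefully. Simplify your answer as much as "
    "possible. Put the final answer in \\boxed{{}}."
)

# {{}} survives str.format as literal {}, so the model is asked to wrap its
# final answer in \boxed{...}.
instance = {"problem": "In how many ways can 4 distinct books be arranged on a shelf?"}
print(prompt.format(**instance))
```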