diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ecf5b070..6f85a6c8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,13 +14,24 @@ repos:
     hooks:
       - id: black
         args: ['--line-length=88']
+        exclude: ^docs/|.*\.(json|yaml|md|txt)$
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.4.2
     hooks:
       # Run the linter.
       - id: ruff
-        args: ['--fix', '--extend-ignore=E402']
+        args: ['--fix']
+        exclude: ^docs/|.*\.(json|yaml|md|txt)$  # skip everything under docs/ and any .json/.yaml/.md/.txt file
+
+  # Local hook: run `make format` as a system command; pass_filenames: false, so no file names are passed
+  - repo: local
+    hooks:
+      - id: run-make-format
+        name: Run Make Format
+        entry: make format
+        language: system
+        pass_filenames: false
   # - repo: https://github.com/pycqa/flake8
   #   rev: 4.0.1
   #   hooks:
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..3670e02f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,51 @@
+# Shared settings: the command prefix that runs tools through Poetry, and the tree they act on
+PYTHON := poetry run
+SRC_DIR := .
+
+# Default target (first rule in the file): print a one-line summary of each available target
+.PHONY: help
+help:
+	@printf '%s\n' "Available targets:" \
+	  "  setup            Install dependencies and set up pre-commit hooks" \
+	  "  format           Run Black and Ruff to format the code" \
+	  "  lint             Run Ruff to check code quality" \
+	  "  test             Run tests with pytest" \
+	  "  precommit        Run pre-commit hooks on all files" \
+	  "  clean            Clean up temporary files and build artifacts"
+
+# Install project dependencies with Poetry, then install the git pre-commit hooks via $(PYTHON) like every other target
+.PHONY: setup
+setup:
+	poetry install
+	$(PYTHON) pre-commit install
+
+# Format code: apply Ruff's auto-fixes, then Black (both via Poetry); --exit-zero leaves unfixable findings to `make lint`
+.PHONY: format
+format:
+	$(PYTHON) ruff check --fix --exit-zero $(SRC_DIR)
+	$(PYTHON) black $(SRC_DIR)
+
+# Lint $(SRC_DIR) with `ruff check` through Poetry (no --fix is passed by this target)
+.PHONY: lint
+lint:
+	$(PYTHON) ruff check $(SRC_DIR)
+
+# Run every configured pre-commit hook against all files (--all-files), through Poetry
+.PHONY: precommit
+precommit:
+	$(PYTHON) pre-commit run --all-files
+
+# Run the test suite with pytest through Poetry (no extra arguments are passed)
+.PHONY: test
+test:
+	$(PYTHON) pytest
+
+# Remove tool caches, packaging output and Python bytecode; -prune stops find descending into dirs being deleted
+.PHONY: clean
+clean:
+	rm -rf .pytest_cache
+	rm -rf .mypy_cache
+	rm -rf .ruff_cache
+	rm -rf build dist *.egg-info
+	find . -type d -name "__pycache__" -prune -exec rm -rf {} +
+	find . -type f -name "*.pyc" -delete
diff --git a/adalflow/adalflow/components/model_client/anthropic_client.py b/adalflow/adalflow/components/model_client/anthropic_client.py
index 92fa535a..c6f4a34f 100644
--- a/adalflow/adalflow/components/model_client/anthropic_client.py
+++ b/adalflow/adalflow/components/model_client/anthropic_client.py
@@ -15,7 +15,8 @@
 anthropic = safe_import(
     OptionalPackages.ANTHROPIC.value[0], OptionalPackages.ANTHROPIC.value[1]
 )
-import anthropic
+
+# `anthropic` itself is already bound by the safe_import(...) call above; only names are imported below.
 from anthropic import (
     RateLimitError,
     APITimeoutError,
diff --git a/adalflow/adalflow/optim/optimizer.py b/adalflow/adalflow/optim/optimizer.py
index b6a68d2a..c6fad814 100644
--- a/adalflow/adalflow/optim/optimizer.py
+++ b/adalflow/adalflow/optim/optimizer.py
@@ -67,7 +67,7 @@ def __init__(
         dataset: Sequence[DataClass] = None,
         exclude_input_fields_from_bootstrap_demos: bool = False,
         *args,
-        **kwargs
+        **kwargs,
     ):
         self._weighted = weighted
         self.dataset = dataset
diff --git a/adalflow/tests/test_random_sample.py b/adalflow/tests/test_random_sample.py
index 3cc6f56f..e6abfb29 100644
--- a/adalflow/tests/test_random_sample.py
+++ b/adalflow/tests/test_random_sample.py
@@ -1,13 +1,12 @@
 import unittest
 from typing import TypeVar
+from adalflow.core.functional import random_sample
+
 
 # Assuming the random_sample function is defined here or imported
 T_co = TypeVar("T_co", covariant=True)
 
 
-from adalflow.core.functional import random_sample
-
-
 class TestRandomSample(unittest.TestCase):
 
     def setUp(self):
diff --git a/benchmarks/hotpot_qa/adal_exp/train_vanilla.py b/benchmarks/hotpot_qa/adal_exp/train_vanilla.py
index 6e87a990..b6cfe9e6 100644
--- a/benchmarks/hotpot_qa/adal_exp/train_vanilla.py
+++ b/benchmarks/hotpot_qa/adal_exp/train_vanilla.py
@@ -114,7 +114,7 @@ def train(
         **gpt_3_model,
         teacher_model_config=gpt_4o_model,
         text_optimizer_model_config=gpt_4o_model,
-        backward_engine_model_config=gpt_4o_model
+        backward_engine_model_config=gpt_4o_model,
     )
     print(adal_component)
     trainer = adal.Trainer(
diff --git a/notebooks/qas/adalflow_object_count_auto_optimization.ipynb b/notebooks/qas/adalflow_object_count_auto_optimization.ipynb
index 65b8509c..ac7e3cbf 100644
--- a/notebooks/qas/adalflow_object_count_auto_optimization.ipynb
+++ b/notebooks/qas/adalflow_object_count_auto_optimization.ipynb
@@ -1,8121 +1,8120 @@
 {
-  "cells": [
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "VVSOpjzJl_cx"
+   },
+   "source": [
+    "# 🤗 Welcome to AdalFlow!\n",
+    "## The library to build & auto-optimize any LLM task pipelines\n",
+    "\n",
+    "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help!\n",
+    "\n",
+    "\n",
+    "# Quick Links\n",
+    "\n",
+    "Github repo: https://github.com/SylphAI-Inc/AdalFlow\n",
+    "\n",
+    "Full Tutorials: https://adalflow.sylph.ai/index.html#.\n",
+    "\n",
+    "Deep dive on each API: check out the [developer notes](https://adalflow.sylph.ai/tutorials/index.html).\n",
+    "\n",
+    "Common use cases along with the auto-optimization:  check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n",
+    "\n",
+    "# Outline\n",
+    "\n",
+    "*Note: As training can consume tokens quickly, and the notebook runtime resets every time you use it, it might be better for you to learn training in your local editor.*\n",
+    "\n",
+    "This is a quick introduction of AdalFlow on question answering use case end to end\n",
+    "\n",
+    "* Trainable Task pipeline with trainable parameters\n",
+    "* Create AdalComponent for your task pipeline\n",
+    "* Use Trainer to diagnose, debug, and to train.\n",
+    "\n",
+    "You can find all source code here: https://github.com/SylphAI-Inc/AdalFlow/tree/main/use_cases/question_answering/bhh_object_count\n",
+    "\n",
+    "**Here is the more detailed tutorial for the code here: https://adalflow.sylph.ai/use_cases/question_answering.html**\n",
+    "\n",
+    "\n",
+    "# Installation\n",
+    "\n",
+    "1. Use `pip` to install the `adalflow` Python package. We will need `openai`, `groq`, and `faiss`(cpu version) from the extra packages.\n",
+    "\n",
+    "  ```bash\n",
+    "  pip install adalflow[openai,groq,faiss-cpu]\n",
+    "  ```\n",
+    "2. Setup  `openai` and `groq` API key in the environment variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "THTvmhjgfiHE"
+   },
+   "outputs": [],
+   "source": [
+    "from IPython.display import clear_output\n",
+    "\n",
+    "!pip install -U adalflow[openai,groq,datasets]\n",
+    "\n",
+    "clear_output()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 35
+    },
+    "id": "nJteJKsNrpcu",
+    "outputId": "d9f7b4d0-d11c-480d-d858-bf9022c18998"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "VVSOpjzJl_cx"
+     "data": {
+      "application/vnd.google.colaboratory.intrinsic+json": {
+       "type": "string"
       },
-      "source": [
-        "# 🤗 Welcome to AdalFlow!\n",
-        "## The library to build & auto-optimize any LLM task pipelines\n",
-        "\n",
-        "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help!\n",
-        "\n",
-        "\n",
-        "# Quick Links\n",
-        "\n",
-        "Github repo: https://github.com/SylphAI-Inc/AdalFlow\n",
-        "\n",
-        "Full Tutorials: https://adalflow.sylph.ai/index.html#.\n",
-        "\n",
-        "Deep dive on each API: check out the [developer notes](https://adalflow.sylph.ai/tutorials/index.html).\n",
-        "\n",
-        "Common use cases along with the auto-optimization:  check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n",
-        "\n",
-        "# Outline\n",
-        "\n",
-        "*Note: As training can consume tokens fast, and the notebook runtime will reset everytime you use, it might be better for you to learn training in your local editor.*\n",
-        "\n",
-        "This is a quick introduction of AdalFlow on question answering use case end to end\n",
-        "\n",
-        "* Trainable Task pipeline with trainable parameters\n",
-        "* Create AdalComponent for your task pipeline\n",
-        "* Use Trainer to diagnose, debug, and to train.\n",
-        "\n",
-        "You can find all source code here: https://github.com/SylphAI-Inc/AdalFlow/tree/main/use_cases/question_answering/bhh_object_count\n",
-        "\n",
-        "**Here is the more detailed tutorial for the code here: https://adalflow.sylph.ai/use_cases/question_answering.html**\n",
-        "\n",
-        "\n",
-        "# Installation\n",
-        "\n",
-        "1. Use `pip` to install the `adalflow` Python package. We will need `openai`, `groq`, and `faiss`(cpu version) from the extra packages.\n",
-        "\n",
-        "  ```bash\n",
-        "  pip install adalflow[openai,groq,faiss-cpu]\n",
-        "  ```\n",
-        "2. Setup  `openai` and `groq` API key in the environment variables"
+      "text/plain": [
+       "'0.2.0'"
       ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import adalflow as adal\n",
+    "\n",
+    "adal.__version__"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "KapUyHMM07pJ"
+   },
+   "source": [
+    "## Set Environment Variables\n",
+    "\n",
+    "Run the following code and pass your api key.\n",
+    "\n",
+    "Note: for normal `.py` projects, follow our [official installation guide](https://lightrag.sylph.ai/get_started/installation.html).\n",
+    "\n",
+    "*Go to [OpenAI](https://platform.openai.com/docs/introduction) and [Groq](https://console.groq.com/docs/) to get API keys if you don't already have.*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "ONfzF9Puzdd_",
+    "outputId": "6a815e21-ab99-463e-c53b-e39ca2ce8f3f"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "THTvmhjgfiHE"
-      },
-      "outputs": [],
-      "source": [
-        "from IPython.display import clear_output\n",
-        "\n",
-        "!pip install -U adalflow[openai,groq,datasets]\n",
-        "\n",
-        "clear_output()"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Please enter your OpenAI API key: ··········\n",
+      "Please enter your GROQ API key: ··········\n",
+      "API keys have been set.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "from getpass import getpass\n",
+    "\n",
+    "# Prompt user to enter their API keys securely\n",
+    "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
+    "groq_api_key = getpass(\"Please enter your GROQ API key, simplly press Enter if you don't have one: \")\n",
+    "\n",
+    "\n",
+    "# Set environment variables\n",
+    "os.environ['OPENAI_API_KEY'] = openai_api_key\n",
+    "os.environ['GROQ_API_KEY'] = groq_api_key\n",
+    "\n",
+    "print(\"API keys have been set.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "SfGS7iddtfpj"
+   },
+   "source": [
+    "\n",
+    "\n",
+    "# 😇 Trainable Task Pipeline\n",
+    "\n",
+    "We will create a task pipeline consisting of a generator with a customized template and a customized output parser.\n",
+    "\n",
+    "Unlike our other pipelines, where the `prompt_kwargs` values are strings, here we will use ``Parameter``. We will set up two parameters: one of type ``ParameterType.PROMPT`` and the other of type ``ParameterType.DEMOS``. The first will be trained by text-grad and the second by the bootstrap few-shot optimizer.\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "nHnvAbO-pXUq"
+   },
+   "outputs": [],
+   "source": [
+    "import adalflow as adal\n",
+    "import re\n",
+    "from typing import Dict, Union\n",
+    "import adalflow as adal\n",
+    "from adalflow.optim.types import ParameterType\n",
+    "\n",
+    "\n",
+    "@adal.fun_to_component\n",
+    "def parse_integer_answer(answer: str):\n",
+    "    \"\"\"A function that parses the last integer from a string using regular expressions.\"\"\"\n",
+    "    try:\n",
+    "        # Use regular expression to find all sequences of digits\n",
+    "        numbers = re.findall(r\"\\d+\", answer)\n",
+    "        if numbers:\n",
+    "            # Get the last number found\n",
+    "            answer = int(numbers[-1])\n",
+    "        else:\n",
+    "            answer = -1\n",
+    "    except ValueError:\n",
+    "        answer = -1\n",
+    "\n",
+    "    return answer\n",
+    "\n",
+    "\n",
+    "few_shot_template = r\"\"\"<START_OF_SYSTEM_PROMPT>\n",
+    "{{system_prompt}}\n",
+    "{# Few shot demos #}\n",
+    "{% if few_shot_demos is not none %}\n",
+    "Here are some examples:\n",
+    "{{few_shot_demos}}\n",
+    "{% endif %}\n",
+    "<END_OF_SYSTEM_PROMPT>\n",
+    "<START_OF_USER>\n",
+    "{{input_str}}\n",
+    "<END_OF_USER>\n",
+    "\"\"\"\n",
+    "\n",
+    "class ObjectCountTaskPipeline(adal.Component):\n",
+    "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
+    "        super().__init__()\n",
+    "\n",
+    "        system_prompt = adal.Parameter(\n",
+    "            data=\"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\",\n",
+    "            role_desc=\"To give task instruction to the language model in the system prompt\",\n",
+    "            requires_opt=True,\n",
+    "            param_type=ParameterType.PROMPT,\n",
+    "        )\n",
+    "        few_shot_demos = adal.Parameter(\n",
+    "            data=None,\n",
+    "            role_desc=\"To provide few shot demos to the language model\",\n",
+    "            requires_opt=True,  # Changed to True for few-shot learning\n",
+    "            param_type=ParameterType.DEMOS,\n",
+    "        )\n",
+    "\n",
+    "        self.llm_counter = adal.Generator(\n",
+    "            model_client=model_client,\n",
+    "            model_kwargs=model_kwargs,\n",
+    "            template=few_shot_template,\n",
+    "            prompt_kwargs={\n",
+    "                \"system_prompt\": system_prompt,\n",
+    "                \"few_shot_demos\": few_shot_demos,\n",
+    "            },\n",
+    "            output_processors=parse_integer_answer,\n",
+    "            use_cache=True,\n",
+    "        )\n",
+    "\n",
+    "    def call(\n",
+    "        self, question: str, id: str = None\n",
+    "    ) -> Union[adal.GeneratorOutput, adal.Parameter]:\n",
+    "        output = self.llm_counter(prompt_kwargs={\"input_str\": question}, id=id)\n",
+    "        return output\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "AvZJjdzZa0cT"
+   },
+   "source": [
+    "Next, we will run this pipeline in both train and eval mode.\n",
+    "\n",
+    "#### Eval mode with GeneratorOutput\n",
+    "\n",
+    "Eval mode will output ``GeneratorOutput``.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Gks3yS8hcR6_"
+   },
+   "source": [
+    "\n",
+    "#### Train mode with different form of output\n",
+    "\n",
+    "Train mode will return ``Parameter``, where the `data` field will be the `raw_response` from the GeneratorOutput, and we put the full GeneratorOutput in the ``full_response`` field of the parameter.\n",
+    "\n",
+    "As the `data` field of the `Parameter` communicates directly with the Optimizer, which is itself an LLM, it is better for the Optimizer to see the exact string response rather than the parsed one.\n",
+    "\n",
+    "Later you will see that we also use ``eval_input`` of the parameter to communicate with the `LossFunction`, as that needs the parsed final output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "eqQSFnZOpfWJ",
+    "outputId": "05b5fc83-09d1-45f4-aacc-6d460fbdd7bd"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 35
-        },
-        "id": "nJteJKsNrpcu",
-        "outputId": "d9f7b4d0-d11c-480d-d858-bf9022c18998"
-      },
-      "outputs": [
-        {
-          "data": {
-            "application/vnd.google.colaboratory.intrinsic+json": {
-              "type": "string"
-            },
-            "text/plain": [
-              "'0.2.0'"
-            ]
-          },
-          "execution_count": 2,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "import adalflow as adal\n",
-        "\n",
-        "adal.__version__"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "KapUyHMM07pJ"
-      },
-      "source": [
-        "## Set Environment Variables\n",
-        "\n",
-        "Run the following code and pass your api key.\n",
-        "\n",
-        "Note: for normal `.py` projects, follow our [official installation guide](https://lightrag.sylph.ai/get_started/installation.html).\n",
-        "\n",
-        "*Go to [OpenAI](https://platform.openai.com/docs/introduction) and [Groq](https://console.groq.com/docs/) to get API keys if you don't already have.*"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
+      "ObjectCountTaskPipeline(\n",
+      "  (llm_counter): Generator(\n",
+      "    model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "    (prompt): Prompt(\n",
+      "      template: <START_OF_SYSTEM_PROMPT>\n",
+      "      {{system_prompt}}\n",
+      "      {# Few shot demos #}\n",
+      "      {% if few_shot_demos is not none %}\n",
+      "      Here are some examples:\n",
+      "      {{few_shot_demos}}\n",
+      "      {% endif %}\n",
+      "      <END_OF_SYSTEM_PROMPT>\n",
+      "      <START_OF_USER>\n",
+      "      {{input_str}}\n",
+      "      <END_OF_USER>\n",
+      "      , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "    )\n",
+      "    (model_client): OpenAIClient()\n",
+      "    (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "  )\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "from adalflow.components.model_client.openai_client import OpenAIClient\n",
+    "from adalflow.components.model_client.groq_client import GroqAPIClient\n",
+    "\n",
+    "\n",
+    "if len(os.environ['OPENAI_API_KEY']) > 1:\n",
+    "  gpt_3_model = {\n",
+    "      \"model_client\": OpenAIClient(),\n",
+    "      \"model_kwargs\": {\n",
+    "          \"model\": \"gpt-3.5-turbo\",\n",
+    "          \"max_tokens\": 2000,\n",
+    "          \"temperature\": 0.0,\n",
+    "          \"top_p\": 0.99,\n",
+    "          \"frequency_penalty\": 0,\n",
+    "          \"presence_penalty\": 0,\n",
+    "          \"stop\": None,\n",
+    "      },\n",
+    "  }\n",
+    "  gpt_4o_model = {\n",
+    "      \"model_client\": OpenAIClient(),\n",
+    "      \"model_kwargs\": {\n",
+    "          \"model\": \"gpt-4o\",\n",
+    "          \"max_tokens\": 4000,\n",
+    "          \"temperature\": 0.0,\n",
+    "          \"top_p\": 0.99,\n",
+    "          \"frequency_penalty\": 0,\n",
+    "          \"presence_penalty\": 0,\n",
+    "          \"stop\": None,\n",
+    "      },\n",
+    "  }\n",
+    "\n",
+    "if len(os.environ['GROQ_API_KEY']) > 1:\n",
+    "  llama_3_1_model ={\n",
+    "      \"model_client\": GroqAPIClient(),\n",
+    "      \"model_kwargs\": {\n",
+    "          \"model\": \"llama-3.1-8b-instant\"\n",
+    "      }\n",
+    "  }\n",
+    "\n",
+    "\n",
+    "question = \"I have a flute, a piano, a trombone, four stoves, a violin, an accordion, a clarinet, a drum, two lamps, and a trumpet. How many musical instruments do I have?\"\n",
+    "task_pipeline = ObjectCountTaskPipeline(**gpt_3_model)\n",
+    "print(task_pipeline)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "DE1xNdYvcXw8",
+    "outputId": "25844c2a-5d4c-4c68-8ca5-38b79ca5b398"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "ONfzF9Puzdd_",
-        "outputId": "6a815e21-ab99-463e-c53b-e39ca2ce8f3f"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Please enter your OpenAI API key: ··········\n",
-            "Please enter your GROQ API key: ··········\n",
-            "API keys have been set.\n"
-          ]
-        }
-      ],
-      "source": [
-        "import os\n",
-        "\n",
-        "from getpass import getpass\n",
-        "\n",
-        "# Prompt user to enter their API keys securely\n",
-        "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
-        "groq_api_key = getpass(\"Please enter your GROQ API key, simplly press Enter if you don't have one: \")\n",
-        "\n",
-        "\n",
-        "# Set environment variables\n",
-        "os.environ['OPENAI_API_KEY'] = openai_api_key\n",
-        "os.environ['GROQ_API_KEY'] = groq_api_key\n",
-        "\n",
-        "print(\"API keys have been set.\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GeneratorOutput(id='1', data=8, error=None, usage=CompletionUsage(completion_tokens=77, prompt_tokens=113, total_tokens=190), raw_response='To find the total number of musical instruments you have, you simply need to count the individual instruments you listed. \\n\\nYou have:\\n- Flute\\n- Piano\\n- Trombone\\n- Violin\\n- Accordion\\n- Clarinet\\n- Drum\\n- Trumpet\\n\\nCounting each of these instruments, we get a total of 8 musical instruments.\\n\\nAnswer: 8', metadata=None)\n"
+     ]
+    }
+   ],
+   "source": [
+    "answer = task_pipeline(question, id=\"1\")\n",
+    "print(answer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "AGUlUsGxcaby",
+    "outputId": "8c8588fe-2994-4d9e-c2d1-26453141f43f"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "SfGS7iddtfpj"
-      },
-      "source": [
-        "\n",
-        "\n",
-        "# 😇 Trainable Task Pipeline\n",
-        "\n",
-        "We will create a task pipeline consists of a generator, with a customzied template, a customized output parser.\n",
-        "\n",
-        "Different from our other pipelines where the `prompt_kwargs` values are strings, but here we will use ``Parameter``. And we will set up two parameter, one is of ``ParameterType.PROMPT`` and the other of type ``ParameterType.DEMOS``. The first one will be trained by text-grad and the second will be trained by boostrap few shot optimizer.\n",
-        "\n",
-        "\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter(name=Generator_output, requires_opt=True, param_type=generator_output (The output of the generator.), role_desc=Output from (llm) Generator, data=To find the total number of musical instruments you have, you simply need to count the individual instruments you listed. \n",
+      "\n",
+      "You have:\n",
+      "- Flute\n",
+      "- Piano\n",
+      "- Trombone\n",
+      "- Violin\n",
+      "- Accordion\n",
+      "- Clarinet\n",
+      "- Drum\n",
+      "- Trumpet\n",
+      "\n",
+      "Counting each of these instruments, we get a total of 8 musical instruments.\n",
+      "\n",
+      "Answer: 8, predecessors={Parameter(name=To_provide, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=To provide few shot demos to the language model, data=None, predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={}), Parameter(name=To_give_ta, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=To give task instruction to the language model in the system prompt, data=You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value., predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={})}, gradients=[],            raw_response=None, input_args={'prompt_kwargs': {'system_prompt': Parameter(name=To_give_ta, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=To give task instruction to the language model in the system prompt, data=You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value., predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={}), 'few_shot_demos': Parameter(name=To_provide, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=To provide few shot demos to the language model, data=None, predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={}), 'input_str': 'I have a flute, a piano, a trombone, four stoves, a violin, an accordion, a clarinet, a drum, two lamps, and a trumpet. 
How many musical instruments do I have?'}, 'model_kwargs': {'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, traces={})\n",
+      "full_response: GeneratorOutput(id=None, data=8, error=None, usage=CompletionUsage(completion_tokens=77, prompt_tokens=113, total_tokens=190), raw_response='To find the total number of musical instruments you have, you simply need to count the individual instruments you listed. \\n\\nYou have:\\n- Flute\\n- Piano\\n- Trombone\\n- Violin\\n- Accordion\\n- Clarinet\\n- Drum\\n- Trumpet\\n\\nCounting each of these instruments, we get a total of 8 musical instruments.\\n\\nAnswer: 8', metadata=None)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# set it to train mode\n",
+    "task_pipeline.train()\n",
+    "answer = task_pipeline(question, id=\"1\")\n",
+    "print(answer)\n",
+    "print(f\"full_response: {answer.full_response}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "YDAiuFzcr4YA"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install datasets\n",
+    "clear_output()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-Gvfcy2IcgWx"
+   },
+   "source": [
+    "### Load Datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "AYBIGsIHpjMe"
+   },
+   "outputs": [],
+   "source": [
+    "from adalflow.datasets.big_bench_hard import BigBenchHard\n",
+    "from adalflow.utils.data import subset_dataset\n",
+    "\n",
+    "def load_datasets(max_samples: int = None):\n",
+    "    \"\"\"Load the dataset\"\"\"\n",
+    "    train_data = BigBenchHard(split=\"train\")\n",
+    "    val_data = BigBenchHard(split=\"val\")\n",
+    "    test_data = BigBenchHard(split=\"test\")\n",
+    "\n",
+    "    # Limit the number of samples\n",
+    "    if max_samples:\n",
+    "        train_data = subset_dataset(train_data, max_samples)\n",
+    "        val_data = subset_dataset(val_data, max_samples)\n",
+    "        test_data = subset_dataset(test_data, max_samples)\n",
+    "\n",
+    "    return train_data, val_data, test_data\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "asw-pJrid8ly",
+    "outputId": "31807c34-0de9-45e5-ebdd-778aa5313802"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "nHnvAbO-pXUq"
-      },
-      "outputs": [],
-      "source": [
-        "import adalflow as adal\n",
-        "import re\n",
-        "from typing import Dict, Union\n",
-        "import adalflow as adal\n",
-        "from adalflow.optim.types import ParameterType\n",
-        "\n",
-        "\n",
-        "@adal.fun_to_component\n",
-        "def parse_integer_answer(answer: str):\n",
-        "    \"\"\"A function that parses the last integer from a string using regular expressions.\"\"\"\n",
-        "    try:\n",
-        "        # Use regular expression to find all sequences of digits\n",
-        "        numbers = re.findall(r\"\\d+\", answer)\n",
-        "        if numbers:\n",
-        "            # Get the last number found\n",
-        "            answer = int(numbers[-1])\n",
-        "        else:\n",
-        "            answer = -1\n",
-        "    except ValueError:\n",
-        "        answer = -1\n",
-        "\n",
-        "    return answer\n",
-        "\n",
-        "\n",
-        "few_shot_template = r\"\"\"<START_OF_SYSTEM_PROMPT>\n",
-        "{{system_prompt}}\n",
-        "{# Few shot demos #}\n",
-        "{% if few_shot_demos is not none %}\n",
-        "Here are some examples:\n",
-        "{{few_shot_demos}}\n",
-        "{% endif %}\n",
-        "<END_OF_SYSTEM_PROMPT>\n",
-        "<START_OF_USER>\n",
-        "{{input_str}}\n",
-        "<END_OF_USER>\n",
-        "\"\"\"\n",
-        "\n",
-        "class ObjectCountTaskPipeline(adal.Component):\n",
-        "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
-        "        super().__init__()\n",
-        "\n",
-        "        system_prompt = adal.Parameter(\n",
-        "            data=\"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\",\n",
-        "            role_desc=\"To give task instruction to the language model in the system prompt\",\n",
-        "            requires_opt=True,\n",
-        "            param_type=ParameterType.PROMPT,\n",
-        "        )\n",
-        "        few_shot_demos = adal.Parameter(\n",
-        "            data=None,\n",
-        "            role_desc=\"To provide few shot demos to the language model\",\n",
-        "            requires_opt=True,  # Changed to True for few-shot learning\n",
-        "            param_type=ParameterType.DEMOS,\n",
-        "        )\n",
-        "\n",
-        "        self.llm_counter = adal.Generator(\n",
-        "            model_client=model_client,\n",
-        "            model_kwargs=model_kwargs,\n",
-        "            template=few_shot_template,\n",
-        "            prompt_kwargs={\n",
-        "                \"system_prompt\": system_prompt,\n",
-        "                \"few_shot_demos\": few_shot_demos,\n",
-        "            },\n",
-        "            output_processors=parse_integer_answer,\n",
-        "            use_cache=True,\n",
-        "        )\n",
-        "\n",
-        "    def call(\n",
-        "        self, question: str, id: str = None\n",
-        "    ) -> Union[adal.GeneratorOutput, adal.Parameter]:\n",
-        "        output = self.llm_counter(prompt_kwargs={\"input_str\": question}, id=id)\n",
-        "        return output\n",
-        "\n",
-        "\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Example(id='d3f33ded-170a-4b87-9b0b-987d5fb7b817', question='I have a cauliflower, a stalk of celery, a cabbage, and a garlic. How many vegetables do I have?', answer='4')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# check the datasets\n",
+    "\n",
+    "train_data, val_data, test_data = load_datasets(max_samples=2)\n",
+    "print(train_data[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "VAVtXE9xeEHt"
+   },
+   "source": [
+    "### Soft link to AdalFlow default file path\n",
+    "\n",
+    "Let's match the default to the current project, so that you can see the downloaded data and later the checkpoints of the training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1SaKH6dkeWus"
+   },
+   "outputs": [],
+   "source": [
+    "! ln -s /root/.adalflow /content/adalflow\n",
+    "\n",
+    "# go to files then you will see a folder named as adalflow"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "YWZzOvAHenME"
+   },
+   "source": [
+    "# 😊 AdalComponent to define everything we need to train\n",
+    "\n",
+    "1. We need `backward_engine_model_config` for the `backward_engine` to compute gradients.\n",
+    "\n",
+    "2. We need ``text_optimizer_model_config`` for the `text optimizer` to propose new prompts.\n",
+    "\n",
+    "3. For the demo optimizer, we need a `teacher_model_config` to configure a teacher generator, in this case, it is the `llm_counter`. The teacher will share the same prompt with the `llm_counter` but you can use a more advanced model.\n",
+    "\n",
+    "In general, we should have all of these parts to use a more advanced model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "9QoNoMWD0rgV"
+   },
+   "source": [
+    "## 🧑 Diagnose\n",
+    "\n",
+    "Diagnose is more of an evaluation, but with detailed logs so that you can manually inspect the wrong output.\n",
+    "\n",
+    "This one shows the minimum config you need to get `diagnose` to work."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6mi7lM3U24Eg"
+   },
+   "outputs": [],
+   "source": [
+    "from adalflow.datasets.types import Example\n",
+    "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n",
+    "\n",
+    "\n",
+    "class ObjectCountAdalComponent(adal.AdalComponent):\n",
+    "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
+    "        task = ObjectCountTaskPipeline(model_client, model_kwargs)\n",
+    "        eval_fn = AnswerMatchAcc(type=\"exact_match\").compute_single_item\n",
+    "        super().__init__(task=task, eval_fn=eval_fn)\n",
+    "\n",
+    "    def prepare_task(self, sample: Example):\n",
+    "        return self.task.call, {\"question\": sample.question, \"id\": sample.id}\n",
+    "\n",
+    "    def prepare_eval(\n",
+    "        self, sample: Example, y_pred: adal.GeneratorOutput\n",
+    "    ) -> float:\n",
+    "        y_label = -1\n",
+    "        if (y_pred is not None and y_pred.data is not None):  # if y_pred and y_pred.data: might introduce bug when the data is 0\n",
+    "            y_label = y_pred.data\n",
+    "        return self.eval_fn, {\"y\": y_label, \"y_gt\": sample.answer}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "eliPeVeM2wcP"
+   },
+   "outputs": [],
+   "source": [
+    "def diagnose(\n",
+    "    model_client: adal.ModelClient,\n",
+    "    model_kwargs: Dict,\n",
+    ") -> Dict:\n",
+    "\n",
+    "    trainset, valset, testset = load_datasets()\n",
+    "    # use max_samples=10 to test the code\n",
+    "\n",
+    "    adal_component = ObjectCountAdalComponent(model_client, model_kwargs)\n",
+    "    trainer = adal.Trainer(adaltask=adal_component)\n",
+    "    trainer.diagnose(dataset=trainset, split=\"train\")\n",
+    "    trainer.diagnose(dataset=valset, split=\"val\")\n",
+    "    trainer.diagnose(dataset=testset, split=\"test\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "nKl9clcb3dFj",
+    "outputId": "676fbb96-c70b-40ab-ea15-93ade1aa9e66"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "AvZJjdzZa0cT"
-      },
-      "source": [
-        "Next, we will run this pipeline in both train and eval mode.\n",
-        "\n",
-        "#### Eval mode with GeneratorOutput\n",
-        "\n",
-        "Eval mode will output ``GeneratorOutput``.\n",
-        "\n"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Gks3yS8hcR6_"
-      },
-      "source": [
-        "\n",
-        "#### Train mode with different form of output\n",
-        "\n",
-        "Train mode will return ``Parameter``, where the `data` field will be the `raw_response`` from the GeneratorOutput, and we put the full GeneratorOutput at the ``full_response`` in the parameter.\n",
-        "\n",
-        "As the `data` field of the `Parameter` directly communicate with the Optimizer, which are an LLM itself, its better than they understand exactly the string response itself instead of the parsed one.\n",
-        "\n",
-        "Later you will see that we also use ``eval_input`` of the parameter to communicate with the `LossFunction` as that need the parsed final output."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
+      "Checkpoint path: /root/.adalflow/ckpt/ObjectCountAdalComponent\n",
+      "Save diagnose to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_train\n",
+      "Saving traces to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_train\n",
+      "all_generators: [('llm_counter', Generator(\n",
+      "  model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "))]\n",
+      "Registered callback for llm_counter, file path: /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_train/llm_counter_call.jsonl\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "eqQSFnZOpfWJ",
-        "outputId": "05b5fc83-09d1-45f4-aacc-6d460fbdd7bd"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
-            "ObjectCountTaskPipeline(\n",
-            "  (llm_counter): Generator(\n",
-            "    model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "    (prompt): Prompt(\n",
-            "      template: <START_OF_SYSTEM_PROMPT>\n",
-            "      {{system_prompt}}\n",
-            "      {# Few shot demos #}\n",
-            "      {% if few_shot_demos is not none %}\n",
-            "      Here are some examples:\n",
-            "      {{few_shot_demos}}\n",
-            "      {% endif %}\n",
-            "      <END_OF_SYSTEM_PROMPT>\n",
-            "      <START_OF_USER>\n",
-            "      {{input_str}}\n",
-            "      <END_OF_USER>\n",
-            "      , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "    )\n",
-            "    (model_client): OpenAIClient()\n",
-            "    (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "  )\n",
-            ")\n"
-          ]
-        }
-      ],
-      "source": [
-        "from adalflow.components.model_client.openai_client import OpenAIClient\n",
-        "from adalflow.components.model_client.groq_client import GroqAPIClient\n",
-        "\n",
-        "\n",
-        "if len(os.environ['OPENAI_API_KEY']) > 1:\n",
-        "  gpt_3_model = {\n",
-        "      \"model_client\": OpenAIClient(),\n",
-        "      \"model_kwargs\": {\n",
-        "          \"model\": \"gpt-3.5-turbo\",\n",
-        "          \"max_tokens\": 2000,\n",
-        "          \"temperature\": 0.0,\n",
-        "          \"top_p\": 0.99,\n",
-        "          \"frequency_penalty\": 0,\n",
-        "          \"presence_penalty\": 0,\n",
-        "          \"stop\": None,\n",
-        "      },\n",
-        "  }\n",
-        "  gpt_4o_model = {\n",
-        "      \"model_client\": OpenAIClient(),\n",
-        "      \"model_kwargs\": {\n",
-        "          \"model\": \"gpt-4o\",\n",
-        "          \"max_tokens\": 4000,\n",
-        "          \"temperature\": 0.0,\n",
-        "          \"top_p\": 0.99,\n",
-        "          \"frequency_penalty\": 0,\n",
-        "          \"presence_penalty\": 0,\n",
-        "          \"stop\": None,\n",
-        "      },\n",
-        "  }\n",
-        "\n",
-        "if len(os.environ['GROQ_API_KEY']) > 1:\n",
-        "  llama_3_1_model ={\n",
-        "      \"model_client\": GroqAPIClient(),\n",
-        "      \"model_kwargs\": {\n",
-        "          \"model\": \"llama-3.1-8b-instant\"\n",
-        "      }\n",
-        "  }\n",
-        "\n",
-        "\n",
-        "question = \"I have a flute, a piano, a trombone, four stoves, a violin, an accordion, a clarinet, a drum, two lamps, and a trumpet. How many musical instruments do I have?\"\n",
-        "task_pipeline = ObjectCountTaskPipeline(**gpt_3_model)\n",
-        "print(task_pipeline)\n"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 5957.82it/s]\n",
+      "Evaluating step(0): 0.88 across 50 samples, Max potential: 0.88: 100%|██████████| 50/50 [00:15<00:00,  3.27it/s]\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "DE1xNdYvcXw8",
-        "outputId": "25844c2a-5d4c-4c68-8ca5-38b79ca5b398"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "GeneratorOutput(id='1', data=8, error=None, usage=CompletionUsage(completion_tokens=77, prompt_tokens=113, total_tokens=190), raw_response='To find the total number of musical instruments you have, you simply need to count the individual instruments you listed. \\n\\nYou have:\\n- Flute\\n- Piano\\n- Trombone\\n- Violin\\n- Accordion\\n- Clarinet\\n- Drum\\n- Trumpet\\n\\nCounting each of these instruments, we get a total of 8 musical instruments.\\n\\nAnswer: 8', metadata=None)\n"
-          ]
-        }
-      ],
-      "source": [
-        "answer = task_pipeline(question, id=\"1\")\n",
-        "print(answer)"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sorted_indices: [8, 16, 23, 25, 31, 47, 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49]\n",
+      "sorted_scores: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]\n",
+      "Loading log file: llm_counter_call.jsonl\n",
+      "Total error samples: 6\n",
+      "Save diagnose to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_val\n",
+      "Saving traces to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_val\n",
+      "all_generators: [('llm_counter', Generator(\n",
+      "  model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "))]\n",
+      "Registered callback for llm_counter, file path: /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_val/llm_counter_call.jsonl\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "AGUlUsGxcaby",
-        "outputId": "8c8588fe-2994-4d9e-c2d1-26453141f43f"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Parameter(name=Generator_output, requires_opt=True, param_type=generator_output (The output of the generator.), role_desc=Output from (llm) Generator, data=To find the total number of musical instruments you have, you simply need to count the individual instruments you listed. \n",
-            "\n",
-            "You have:\n",
-            "- Flute\n",
-            "- Piano\n",
-            "- Trombone\n",
-            "- Violin\n",
-            "- Accordion\n",
-            "- Clarinet\n",
-            "- Drum\n",
-            "- Trumpet\n",
-            "\n",
-            "Counting each of these instruments, we get a total of 8 musical instruments.\n",
-            "\n",
-            "Answer: 8, predecessors={Parameter(name=To_provide, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=To provide few shot demos to the language model, data=None, predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={}), Parameter(name=To_give_ta, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=To give task instruction to the language model in the system prompt, data=You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value., predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={})}, gradients=[],            raw_response=None, input_args={'prompt_kwargs': {'system_prompt': Parameter(name=To_give_ta, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=To give task instruction to the language model in the system prompt, data=You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value., predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={}), 'few_shot_demos': Parameter(name=To_provide, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=To provide few shot demos to the language model, data=None, predecessors=set(), gradients=[],            raw_response=None, input_args=None, traces={}), 'input_str': 'I have a flute, a piano, a trombone, four stoves, a violin, an accordion, a clarinet, a drum, two lamps, and a trumpet. 
How many musical instruments do I have?'}, 'model_kwargs': {'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, traces={})\n",
-            "full_response: GeneratorOutput(id=None, data=8, error=None, usage=CompletionUsage(completion_tokens=77, prompt_tokens=113, total_tokens=190), raw_response='To find the total number of musical instruments you have, you simply need to count the individual instruments you listed. \\n\\nYou have:\\n- Flute\\n- Piano\\n- Trombone\\n- Violin\\n- Accordion\\n- Clarinet\\n- Drum\\n- Trumpet\\n\\nCounting each of these instruments, we get a total of 8 musical instruments.\\n\\nAnswer: 8', metadata=None)\n"
-          ]
-        }
-      ],
-      "source": [
-        "# set it to train mode\n",
-        "task_pipeline.train()\n",
-        "answer = task_pipeline(question, id=\"1\")\n",
-        "print(answer)\n",
-        "print(f\"full_response: {answer.full_response}\")"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3203.76it/s]\n",
+      "Evaluating step(0): 0.8 across 50 samples, Max potential: 0.8: 100%|██████████| 50/50 [00:15<00:00,  3.26it/s]\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "YDAiuFzcr4YA"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install datasets\n",
-        "clear_output()"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sorted_indices: [1, 2, 5, 10, 24, 36, 38, 42, 44, 47, 0, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 40, 41, 43, 45, 46, 48, 49]\n",
+      "sorted_scores: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]\n",
+      "Loading log file: llm_counter_call.jsonl\n",
+      "Total error samples: 10\n",
+      "Save diagnose to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_test\n",
+      "Saving traces to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_test\n",
+      "all_generators: [('llm_counter', Generator(\n",
+      "  model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "))]\n",
+      "Registered callback for llm_counter, file path: /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_test/llm_counter_call.jsonl\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "-Gvfcy2IcgWx"
-      },
-      "source": [
-        "### Load Datasets"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 5545.09it/s]\n",
+      "Evaluating step(0): 0.83 across 100 samples, Max potential: 0.83: 100%|██████████| 100/100 [00:28<00:00,  3.50it/s]"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "AYBIGsIHpjMe"
-      },
-      "outputs": [],
-      "source": [
-        "from adalflow.datasets.big_bench_hard import BigBenchHard\n",
-        "from adalflow.utils.data import subset_dataset\n",
-        "\n",
-        "def load_datasets(max_samples: int = None):\n",
-        "    \"\"\"Load the dataset\"\"\"\n",
-        "    train_data = BigBenchHard(split=\"train\")\n",
-        "    val_data = BigBenchHard(split=\"val\")\n",
-        "    test_data = BigBenchHard(split=\"test\")\n",
-        "\n",
-        "    # Limit the number of samples\n",
-        "    if max_samples:\n",
-        "        train_data = subset_dataset(train_data, max_samples)\n",
-        "        val_data = subset_dataset(val_data, max_samples)\n",
-        "        test_data = subset_dataset(test_data, max_samples)\n",
-        "\n",
-        "    return train_data, val_data, test_data\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sorted_indices: [7, 18, 19, 20, 23, 24, 25, 43, 58, 59, 63, 74, 75, 79, 85, 97, 99, 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 76, 77, 78, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 98]\n",
+      "sorted_scores: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]\n",
+      "Loading log file: llm_counter_call.jsonl\n",
+      "Total error samples: 17\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "asw-pJrid8ly",
-        "outputId": "31807c34-0de9-45e5-ebdd-778aa5313802"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Example(id='d3f33ded-170a-4b87-9b0b-987d5fb7b817', question='I have a cauliflower, a stalk of celery, a cabbage, and a garlic. How many vegetables do I have?', answer='4')\n"
-          ]
-        }
-      ],
-      "source": [
-        "# check the datasets\n",
-        "\n",
-        "train_data, val_data, test_data = load_datasets(max_samples=2)\n",
-        "print(train_data[0])"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "diagnose(**gpt_3_model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "dSu4VQri3y3D"
+   },
+   "source": [
+    "Now, you can go to `/content/adalflow/ckpt/ObjectCountAdalComponent/diagnose_train/stats.json` to view the average score for each split, and to the `diagnose.json` to inspect the different errors.\n",
+    "\n",
+    "Here is the overall score for each split.\n",
+    "\n",
+    "| Train  | Val| Test |\n",
+    "|:--------- |:--------:| ---------:|\n",
+    "| 0.88      | 0.8   |    0.83  |\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1vzJyp-W0z7I"
+   },
+   "source": [
+    "## 🐛 Debug"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TmlCvJu804dJ"
+   },
+   "source": [
+    "## ✅ Train\n",
+    "\n",
+    "Now, let's start training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "4TWCn0did6-K"
+   },
+   "outputs": [],
+   "source": [
+    "from adalflow.datasets.types import Example\n",
+    "\n",
+    "\n",
+    "class ObjectCountAdalComponent(adal.AdalComponent):# noqa: F811\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        model_client: adal.ModelClient,\n",
+    "        model_kwargs: Dict,\n",
+    "        backward_engine_model_config: Dict,\n",
+    "        teacher_model_config: Dict,\n",
+    "        text_optimizer_model_config: Dict,\n",
+    "    ):\n",
+    "        task = ObjectCountTaskPipeline(model_client, model_kwargs)\n",
+    "        eval_fn = AnswerMatchAcc(type=\"exact_match\").compute_single_item\n",
+    "        loss_fn = adal.EvalFnToTextLoss(\n",
+    "            eval_fn=eval_fn,\n",
+    "            eval_fn_desc=\"exact_match: 1 if str(y) == str(y_gt) else 0\",\n",
+    "        )\n",
+    "        super().__init__(task=task, eval_fn=eval_fn, loss_fn=loss_fn)\n",
+    "\n",
+    "        self.backward_engine_model_config = backward_engine_model_config\n",
+    "        self.teacher_model_config = teacher_model_config\n",
+    "        self.text_optimizer_model_config = text_optimizer_model_config\n",
+    "\n",
+    "    def prepare_task(self, sample: Example):\n",
+    "        return self.task.call, {\"question\": sample.question, \"id\": sample.id}\n",
+    "\n",
+    "\n",
+    "    def prepare_eval(\n",
+    "        self, sample: Example, y_pred: adal.GeneratorOutput\n",
+    "    ) -> float:\n",
+    "        y_label = -1\n",
+    "        if (y_pred is not None and y_pred.data is not None):  # if y_pred and y_pred.data: might introduce bug when the data is 0\n",
+    "            y_label = y_pred.data\n",
+    "        return self.eval_fn, {\"y\": y_label, \"y_gt\": sample.answer}\n",
+    "\n",
+    "    def prepare_loss(self, sample: Example, pred: adal.Parameter):\n",
+    "        # prepare gt parameter\n",
+    "        y_gt = adal.Parameter(\n",
+    "            name=\"y_gt\",\n",
+    "            data=sample.answer,\n",
+    "            eval_input=sample.answer,\n",
+    "            requires_opt=False,\n",
+    "        )\n",
+    "\n",
+    "        # pred's full_response is the output of the task pipeline which is GeneratorOutput\n",
+    "        pred.eval_input = pred.full_response.data\n",
+    "        return self.loss_fn, {\"kwargs\": {\"y\": pred, \"y_gt\": y_gt}}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "dezwX2yn1eQS"
+   },
+   "outputs": [],
+   "source": [
+    "def train(\n",
+    "    train_batch_size=4,  # larger batch size is not that effective, probably because of llm's lost in the middle\n",
+    "    raw_shots: int = 0,\n",
+    "    bootstrap_shots: int = 1,\n",
+    "    max_steps=1,\n",
+    "    num_workers=4,\n",
+    "    strategy=\"random\",\n",
+    "    optimization_order=\"sequential\",\n",
+    "    debug=False,\n",
+    "    resume_from_ckpt=None,\n",
+    "    exclude_input_fields_from_bootstrap_demos=False,\n",
+    "):\n",
+    "    adal_component = ObjectCountAdalComponent(\n",
+    "        **gpt_3_model,\n",
+    "        teacher_model_config=gpt_4o_model,\n",
+    "        text_optimizer_model_config=gpt_4o_model,\n",
+    "        backward_engine_model_config=gpt_4o_model\n",
+    "    )\n",
+    "    print(adal_component)\n",
+    "    trainer = adal.Trainer(\n",
+    "        train_batch_size=train_batch_size,\n",
+    "        adaltask=adal_component,\n",
+    "        strategy=strategy,\n",
+    "        max_steps=max_steps,\n",
+    "        num_workers=num_workers,\n",
+    "        raw_shots=raw_shots,\n",
+    "        bootstrap_shots=bootstrap_shots,\n",
+    "        debug=debug,\n",
+    "        weighted_sampling=True,\n",
+    "        optimization_order=optimization_order,\n",
+    "        exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos,\n",
+    "    )\n",
+    "    print(trainer)\n",
+    "\n",
+    "    train_dataset, val_dataset, test_dataset = load_datasets()\n",
+    "    trainer.fit(\n",
+    "        train_dataset=train_dataset,\n",
+    "        val_dataset=val_dataset,\n",
+    "        test_dataset=test_dataset,\n",
+    "        debug=debug,\n",
+    "        resume_from_ckpt=resume_from_ckpt,\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "NGKYozGt60Pp"
+   },
+   "source": [
+    "We use the `sequential` optimization order by default, so we will end up with 24 steps in total: 12 for the text optimizer and 12 for the demo optimizer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "yDwLwL0L7Rsw",
+    "outputId": "1b7e413b-a1d3-4388-fc0c-ca4b1c072585"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "VAVtXE9xeEHt"
-      },
-      "source": [
-        "### Soft link to AdalFlow default file path\n",
-        "\n",
-        "Lets' match the default to the current project, so that you can see the downloaded data and later the checkpoints of the training."
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "1SaKH6dkeWus"
-      },
-      "outputs": [],
-      "source": [
-        "! ln -s /root/.adalflow /content/adalflow\n",
-        "\n",
-        "# go to files then you will see a folder named as adalflow"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
+      "ObjectCountAdalComponent(\n",
+      "  eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
+      "  (task): ObjectCountTaskPipeline(\n",
+      "    (llm_counter): Generator(\n",
+      "      model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "      (prompt): Prompt(\n",
+      "        template: <START_OF_SYSTEM_PROMPT>\n",
+      "        {{system_prompt}}\n",
+      "        {# Few shot demos #}\n",
+      "        {% if few_shot_demos is not none %}\n",
+      "        Here are some examples:\n",
+      "        {{few_shot_demos}}\n",
+      "        {% endif %}\n",
+      "        <END_OF_SYSTEM_PROMPT>\n",
+      "        <START_OF_USER>\n",
+      "        {{input_str}}\n",
+      "        <END_OF_USER>\n",
+      "        , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "      )\n",
+      "      (model_client): OpenAIClient()\n",
+      "      (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "    )\n",
+      "  )\n",
+      "  (loss_fn): EvalFnToTextLoss()\n",
+      ")\n",
+      "Trainer(\n",
+      "  (adaltask): ObjectCountAdalComponent(\n",
+      "    eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
+      "    (task): ObjectCountTaskPipeline(\n",
+      "      (llm_counter): Generator(\n",
+      "        model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "        (prompt): Prompt(\n",
+      "          template: <START_OF_SYSTEM_PROMPT>\n",
+      "          {{system_prompt}}\n",
+      "          {# Few shot demos #}\n",
+      "          {% if few_shot_demos is not none %}\n",
+      "          Here are some examples:\n",
+      "          {{few_shot_demos}}\n",
+      "          {% endif %}\n",
+      "          <END_OF_SYSTEM_PROMPT>\n",
+      "          <START_OF_USER>\n",
+      "          {{input_str}}\n",
+      "          <END_OF_USER>\n",
+      "          , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "        )\n",
+      "        (model_client): OpenAIClient()\n",
+      "        (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "      )\n",
+      "    )\n",
+      "    (loss_fn): EvalFnToTextLoss()\n",
+      "  )\n",
+      ")\n",
+      "raw_shots: 0, bootstrap_shots: 1\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator configured.\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Backward engine configured for all generators.\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "YWZzOvAHenME"
-      },
-      "source": [
-        "# 😊 AdalComponent to define everything we need to train\n",
-        "\n",
-        "1. We need `backward_engine_model_config`` for ``backward_engine`` to compute gradient.\n",
-        "\n",
-        "2. We need ``text_optimizer_model_config`` for the `text optimizer` for propose new prompts.\n",
-        "\n",
-        "3. For the demo optimizer, we need a `teacher_model_config` to config a teacher generator, in this case, it is the `llm_counter`. The teacher will share the same prompt with the `llm_counter` but you can use a more advanced model.\n",
-        "\n",
-        "In general, we should have all of these parts to use a more advanced model."
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 6482.70it/s]\n",
+      "Evaluating step(0): 0.8 across 50 samples, Max potential: 0.8: 100%|██████████| 50/50 [00:00<00:00, 347.01it/s]\n",
+      "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 2017.67it/s]\n",
+      "Evaluating step(0): 0.83 across 100 samples, Max potential: 0.83: 100%|██████████| 100/100 [00:00<00:00, 286.59it/s]\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "9QoNoMWD0rgV"
-      },
-      "source": [
-        "## 🧑 Diagnose\n",
-        "\n",
-        "Diagnose is more of an evaluation, but with detailed logs so that you can manually inspect the wrong output.\n",
-        "\n",
-        "This one shows the minimum config you need to get the `diagnose` work."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initial validation score: 0.8\n",
+      "Initial test score: 0.83\n",
+      "Checkpoint path: /root/.adalflow/ckpt/ObjectCountAdalComponent\n",
+      "save to /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "6mi7lM3U24Eg"
-      },
-      "outputs": [],
-      "source": [
-        "from adalflow.datasets.types import Example\n",
-        "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n",
-        "\n",
-        "\n",
-        "class ObjectCountAdalComponent(adal.AdalComponent):\n",
-        "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
-        "        task = ObjectCountTaskPipeline(model_client, model_kwargs)\n",
-        "        eval_fn = AnswerMatchAcc(type=\"exact_match\").compute_single_item\n",
-        "        super().__init__(task=task, eval_fn=eval_fn)\n",
-        "\n",
-        "    def prepare_task(self, sample: Example):\n",
-        "        return self.task.call, {\"question\": sample.question, \"id\": sample.id}\n",
-        "\n",
-        "    def prepare_eval(\n",
-        "        self, sample: Example, y_pred: adal.GeneratorOutput\n",
-        "    ) -> float:\n",
-        "        y_label = -1\n",
-        "        if (y_pred is not None and y_pred.data is not None):  # if y_pred and y_pred.data: might introduce bug when the data is 0\n",
-        "            y_label = y_pred.data\n",
-        "        return self.eval_fn, {\"y\": y_label, \"y_gt\": sample.answer}"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 1:   0%|          | 0/13 [00:00<?, ?it/s]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 133.39it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 1489.32it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 262.46it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12865.96it/s]\n",
+      "Training Step: 2:   8%|▊         | 1/13 [00:00<00:01,  8.29it/s]"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "eliPeVeM2wcP"
-      },
-      "outputs": [],
-      "source": [
-        "def diagnose(\n",
-        "    model_client: adal.ModelClient,\n",
-        "    model_kwargs: Dict,\n",
-        ") -> Dict:\n",
-        "\n",
-        "    trainset, valset, testset = load_datasets()\n",
-        "    # use max_samples=10 to test the code\n",
-        "\n",
-        "    adal_component = ObjectCountAdalComponent(model_client, model_kwargs)\n",
-        "    trainer = adal.Trainer(adaltask=adal_component)\n",
-        "    trainer.diagnose(dataset=trainset, split=\"train\")\n",
-        "    trainer.diagnose(dataset=valset, split=\"val\")\n",
-        "    trainer.diagnose(dataset=testset, split=\"test\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping batch 0 as acc: 1.0\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "nKl9clcb3dFj",
-        "outputId": "676fbb96-c70b-40ab-ea15-93ade1aa9e66"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
-            "Checkpoint path: /root/.adalflow/ckpt/ObjectCountAdalComponent\n",
-            "Save diagnose to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_train\n",
-            "Saving traces to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_train\n",
-            "all_generators: [('llm_counter', Generator(\n",
-            "  model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "))]\n",
-            "Registered callback for llm_counter, file path: /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_train/llm_counter_call.jsonl\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 5957.82it/s]\n",
-            "Evaluating step(0): 0.88 across 50 samples, Max potential: 0.88: 100%|██████████| 50/50 [00:15<00:00,  3.27it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sorted_indices: [8, 16, 23, 25, 31, 47, 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49]\n",
-            "sorted_scores: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]\n",
-            "Loading log file: llm_counter_call.jsonl\n",
-            "Total error samples: 6\n",
-            "Save diagnose to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_val\n",
-            "Saving traces to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_val\n",
-            "all_generators: [('llm_counter', Generator(\n",
-            "  model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "))]\n",
-            "Registered callback for llm_counter, file path: /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_val/llm_counter_call.jsonl\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3203.76it/s]\n",
-            "Evaluating step(0): 0.8 across 50 samples, Max potential: 0.8: 100%|██████████| 50/50 [00:15<00:00,  3.26it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sorted_indices: [1, 2, 5, 10, 24, 36, 38, 42, 44, 47, 0, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 40, 41, 43, 45, 46, 48, 49]\n",
-            "sorted_scores: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]\n",
-            "Loading log file: llm_counter_call.jsonl\n",
-            "Total error samples: 10\n",
-            "Save diagnose to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_test\n",
-            "Saving traces to /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_test\n",
-            "all_generators: [('llm_counter', Generator(\n",
-            "  model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "))]\n",
-            "Registered callback for llm_counter, file path: /root/.adalflow/ckpt/ObjectCountAdalComponent/diagnose_test/llm_counter_call.jsonl\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 5545.09it/s]\n",
-            "Evaluating step(0): 0.83 across 100 samples, Max potential: 0.83: 100%|██████████| 100/100 [00:28<00:00,  3.50it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sorted_indices: [7, 18, 19, 20, 23, 24, 25, 43, 58, 59, 63, 74, 75, 79, 85, 97, 99, 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 76, 77, 78, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 98]\n",
-            "sorted_scores: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]\n",
-            "Loading log file: llm_counter_call.jsonl\n",
-            "Total error samples: 17\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n"
-          ]
-        }
-      ],
-      "source": [
-        "diagnose(**gpt_3_model)"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 384.73it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 927.64it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 754.71it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12087.33it/s]\n",
+      "Training Step: 3:  15%|█▌        | 2/13 [00:00<00:01,  8.92it/s]"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "dSu4VQri3y3D"
-      },
-      "source": [
-        "Now, you can go to `/content/adalflow/ckpt/ObjectCountAdalComponent/diagnose_train/stats.json` to view the average score for each split. And also the `diagnose.json` for different errors.\n",
-        "\n",
-        "Here is the overall score for each split.\n",
-        "\n",
-        "| Train  | Val| Test |\n",
-        "|:--------- |:--------:| ---------:|\n",
-        "| 0.88      | 0.8   |    0.83  |\n",
-        "\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping batch 1 as acc: 1.0\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "1vzJyp-W0z7I"
-      },
-      "source": [
-        "## 🐛 Debug"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 193.44it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 2761.68it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 810.38it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11320.66it/s]\n",
+      "Training Step: 4:  15%|█▌        | 2/13 [00:00<00:01,  8.92it/s]"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "TmlCvJu804dJ"
-      },
-      "source": [
-        "## ✅ Train\n",
-        "\n",
-        "Now, let's start training."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping batch 2 as acc: 1.0\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "4TWCn0did6-K"
-      },
-      "outputs": [],
-      "source": [
-        "from adalflow.datasets.types import Example\n",
-        "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n",
-        "\n",
-        "\n",
-        "class ObjectCountAdalComponent(adal.AdalComponent):\n",
-        "    def __init__(\n",
-        "        self,\n",
-        "        model_client: adal.ModelClient,\n",
-        "        model_kwargs: Dict,\n",
-        "        backward_engine_model_config: Dict,\n",
-        "        teacher_model_config: Dict,\n",
-        "        text_optimizer_model_config: Dict,\n",
-        "    ):\n",
-        "        task = ObjectCountTaskPipeline(model_client, model_kwargs)\n",
-        "        eval_fn = AnswerMatchAcc(type=\"exact_match\").compute_single_item\n",
-        "        loss_fn = adal.EvalFnToTextLoss(\n",
-        "            eval_fn=eval_fn,\n",
-        "            eval_fn_desc=\"exact_match: 1 if str(y) == str(y_gt) else 0\",\n",
-        "        )\n",
-        "        super().__init__(task=task, eval_fn=eval_fn, loss_fn=loss_fn)\n",
-        "\n",
-        "        self.backward_engine_model_config = backward_engine_model_config\n",
-        "        self.teacher_model_config = teacher_model_config\n",
-        "        self.text_optimizer_model_config = text_optimizer_model_config\n",
-        "\n",
-        "    def prepare_task(self, sample: Example):\n",
-        "        return self.task.call, {\"question\": sample.question, \"id\": sample.id}\n",
-        "\n",
-        "\n",
-        "    def prepare_eval(\n",
-        "        self, sample: Example, y_pred: adal.GeneratorOutput\n",
-        "    ) -> float:\n",
-        "        y_label = -1\n",
-        "        if (y_pred is not None and y_pred.data is not None):  # if y_pred and y_pred.data: might introduce bug when the data is 0\n",
-        "            y_label = y_pred.data\n",
-        "        return self.eval_fn, {\"y\": y_label, \"y_gt\": sample.answer}\n",
-        "\n",
-        "    def prepare_loss(self, sample: Example, pred: adal.Parameter):\n",
-        "        # prepare gt parameter\n",
-        "        y_gt = adal.Parameter(\n",
-        "            name=\"y_gt\",\n",
-        "            data=sample.answer,\n",
-        "            eval_input=sample.answer,\n",
-        "            requires_opt=False,\n",
-        "        )\n",
-        "\n",
-        "        # pred's full_response is the output of the task pipeline which is GeneratorOutput\n",
-        "        pred.eval_input = pred.full_response.data\n",
-        "        return self.loss_fn, {\"kwargs\": {\"y\": pred, \"y_gt\": y_gt}}"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 234.44it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 2487.72it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 1024.88it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12018.06it/s]\n",
+      "Training Step: 5:  31%|███       | 4/13 [00:00<00:00, 11.90it/s]"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "dezwX2yn1eQS"
-      },
-      "outputs": [],
-      "source": [
-        "def train(\n",
-        "    train_batch_size=4,  # larger batch size is not that effective, probably because of llm's lost in the middle\n",
-        "    raw_shots: int = 0,\n",
-        "    bootstrap_shots: int = 1,\n",
-        "    max_steps=1,\n",
-        "    num_workers=4,\n",
-        "    strategy=\"random\",\n",
-        "    optimization_order=\"sequential\",\n",
-        "    debug=False,\n",
-        "    resume_from_ckpt=None,\n",
-        "    exclude_input_fields_from_bootstrap_demos=False,\n",
-        "):\n",
-        "    adal_component = ObjectCountAdalComponent(\n",
-        "        **gpt_3_model,\n",
-        "        teacher_model_config=gpt_4o_model,\n",
-        "        text_optimizer_model_config=gpt_4o_model,\n",
-        "        backward_engine_model_config=gpt_4o_model\n",
-        "    )\n",
-        "    print(adal_component)\n",
-        "    trainer = adal.Trainer(\n",
-        "        train_batch_size=train_batch_size,\n",
-        "        adaltask=adal_component,\n",
-        "        strategy=strategy,\n",
-        "        max_steps=max_steps,\n",
-        "        num_workers=num_workers,\n",
-        "        raw_shots=raw_shots,\n",
-        "        bootstrap_shots=bootstrap_shots,\n",
-        "        debug=debug,\n",
-        "        weighted_sampling=True,\n",
-        "        optimization_order=optimization_order,\n",
-        "        exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos,\n",
-        "    )\n",
-        "    print(trainer)\n",
-        "\n",
-        "    train_dataset, val_dataset, test_dataset = load_datasets()\n",
-        "    trainer.fit(\n",
-        "        train_dataset=train_dataset,\n",
-        "        val_dataset=val_dataset,\n",
-        "        test_dataset=test_dataset,\n",
-        "        debug=debug,\n",
-        "        resume_from_ckpt=resume_from_ckpt,\n",
-        "    )\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping batch 3 as acc: 1.0\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "NGKYozGt60Pp"
-      },
-      "source": [
-        "We use `Sequential` in default, we will end up with 24 steps in total, 12 for text optimizer and 12 for the demo optimizer."
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 133.95it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 4552.84it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 392.05it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 770.69it/s]\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "yDwLwL0L7Rsw",
-        "outputId": "1b7e413b-a1d3-4388-fc0c-ca4b1c072585"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
-            "ObjectCountAdalComponent(\n",
-            "  eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
-            "  (task): ObjectCountTaskPipeline(\n",
-            "    (llm_counter): Generator(\n",
-            "      model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "      (prompt): Prompt(\n",
-            "        template: <START_OF_SYSTEM_PROMPT>\n",
-            "        {{system_prompt}}\n",
-            "        {# Few shot demos #}\n",
-            "        {% if few_shot_demos is not none %}\n",
-            "        Here are some examples:\n",
-            "        {{few_shot_demos}}\n",
-            "        {% endif %}\n",
-            "        <END_OF_SYSTEM_PROMPT>\n",
-            "        <START_OF_USER>\n",
-            "        {{input_str}}\n",
-            "        <END_OF_USER>\n",
-            "        , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "      )\n",
-            "      (model_client): OpenAIClient()\n",
-            "      (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "    )\n",
-            "  )\n",
-            "  (loss_fn): EvalFnToTextLoss()\n",
-            ")\n",
-            "Trainer(\n",
-            "  (adaltask): ObjectCountAdalComponent(\n",
-            "    eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
-            "    (task): ObjectCountTaskPipeline(\n",
-            "      (llm_counter): Generator(\n",
-            "        model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "        (prompt): Prompt(\n",
-            "          template: <START_OF_SYSTEM_PROMPT>\n",
-            "          {{system_prompt}}\n",
-            "          {# Few shot demos #}\n",
-            "          {% if few_shot_demos is not none %}\n",
-            "          Here are some examples:\n",
-            "          {{few_shot_demos}}\n",
-            "          {% endif %}\n",
-            "          <END_OF_SYSTEM_PROMPT>\n",
-            "          <START_OF_USER>\n",
-            "          {{input_str}}\n",
-            "          <END_OF_USER>\n",
-            "          , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "        )\n",
-            "        (model_client): OpenAIClient()\n",
-            "        (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "      )\n",
-            "    )\n",
-            "    (loss_fn): EvalFnToTextLoss()\n",
-            "  )\n",
-            ")\n",
-            "raw_shots: 0, bootstrap_shots: 1\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator configured.\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Backward engine configured for all generators.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 6482.70it/s]\n",
-            "Evaluating step(0): 0.8 across 50 samples, Max potential: 0.8: 100%|██████████| 50/50 [00:00<00:00, 347.01it/s]\n",
-            "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 2017.67it/s]\n",
-            "Evaluating step(0): 0.83 across 100 samples, Max potential: 0.83: 100%|██████████| 100/100 [00:00<00:00, 286.59it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Initial validation score: 0.8\n",
-            "Initial test score: 0.83\n",
-            "Checkpoint path: /root/.adalflow/ckpt/ObjectCountAdalComponent\n",
-            "save to /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 1:   0%|          | 0/13 [00:00<?, ?it/s]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 133.39it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 1489.32it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 262.46it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12865.96it/s]\n",
-            "Training Step: 2:   8%|▊         | 1/13 [00:00<00:01,  8.29it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Skipping batch 0 as acc: 1.0\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 384.73it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 927.64it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 754.71it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12087.33it/s]\n",
-            "Training Step: 3:  15%|█▌        | 2/13 [00:00<00:01,  8.92it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Skipping batch 1 as acc: 1.0\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 193.44it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 2761.68it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 810.38it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11320.66it/s]\n",
-            "Training Step: 4:  15%|█▌        | 2/13 [00:00<00:01,  8.92it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Skipping batch 2 as acc: 1.0\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 234.44it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 2487.72it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 1024.88it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12018.06it/s]\n",
-            "Training Step: 5:  31%|███       | 4/13 [00:00<00:00, 11.90it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Skipping batch 3 as acc: 1.0\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 133.95it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 4552.84it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 392.05it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 770.69it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.75\n",
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 0.0\n",
-            "Subset loss backward time: 5.383355617523193\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 225.14it/s]\n",
-            "Evaluating step(4): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  2.43it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 1.0 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 445.28it/s]\n",
-            "Evaluating step(4): 1.0 across 4 samples, Max potential: 1.0: 100%|██████████| 4/4 [00:01<00:00,  2.67it/s]\n",
-            "Proposing:   0%|          | 0/5 [00:03<?, ?it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 1.0 >= 0.75\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1139.66it/s]\n",
-            "Evaluating step(5): 0.84 across 50 samples, Max potential: 0.84: 100%|██████████| 50/50 [00:16<00:00,  3.04it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer step: 0.84 > 0.8\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 1658.72it/s]\n",
-            "Evaluating step(4): 0.91 across 100 samples, Max potential: 0.91: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]\n",
-            "Training Step: 6:  38%|███▊      | 5/13 [00:56<02:18, 17.27s/it]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 207.97it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.86it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 494.99it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 805.09it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.75\n",
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "Subset loss backward time: 4.081957817077637\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 538.35it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00,  3.13it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:08,  2.13s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 151.18it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 204.61it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.66s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 698.62it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 571.41it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:05<00:03,  1.61s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 116.83it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.50it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:07<00:01,  1.88s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 399.65it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 571.09it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:08<00:00,  1.69s/it]\n",
-            "Training Step: 7:  46%|████▌     | 6/13 [01:09<01:53, 16.18s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 59.06it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 410.78it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 4694.24it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 7\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.875\n",
-            "Moving batch correct size: 7\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "Subset loss backward time: 3.0843119621276855\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 154.50it/s]\n",
-            "Evaluating step(6): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  1.52it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 1.0 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 8/8 [00:00<00:00, 279.47it/s]\n",
-            "Evaluating step(6): 0.875 across 8 samples, Max potential: 0.875: 100%|██████████| 8/8 [00:01<00:00,  4.43it/s]\n",
-            "Proposing:   0%|          | 0/5 [00:04<?, ?it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 0.875 >= 0.875\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2336.58it/s]\n",
-            "Evaluating step(7): 0.84 across 50 samples, Max potential: 0.84: 100%|██████████| 50/50 [00:17<00:00,  2.88it/s]\n",
-            "Training Step: 8:  54%|█████▍    | 7/13 [01:37<01:58, 19.81s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.84 <= 0.84\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 148.75it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.04it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 345.11it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 7550.50it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 11\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.9166666666666666\n",
-            "Moving batch correct size: 11\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "Subset loss backward time: 2.337067127227783\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 193.84it/s]\n",
-            "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.16it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:09,  2.39s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified in words. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 147.89it/s]\n",
-            "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.04it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:04<00:07,  2.41s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 423.61it/s]\n",
-            "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 556.86it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:05<00:03,  1.78s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 532.41it/s]\n",
-            "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 522.78it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:06<00:01,  1.44s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified in words. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 284.18it/s]\n",
-            "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 160.35it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.59s/it]\n",
-            "Training Step: 9:  62%|██████▏   | 8/13 [01:50<01:27, 17.55s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 87.73it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.62it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 342.85it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 7157.52it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 14\n",
-            "Moving batch error size: 2\n",
-            "Moving batch acc: 0.875\n",
-            "Moving batch correct size: 14\n",
-            "Moving batch error size: 2\n",
-            "Subset Error size: 2\n",
-            "Subset Correct size: 4\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_0 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "Subset loss backward time: 7.823317050933838\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 166.50it/s]\n",
-            "Evaluating step(8): 0.8333 across 6 samples, Max potential: 0.8333: 100%|██████████| 6/6 [00:02<00:00,  2.78it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 0.8333333333333334 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 16/16 [00:00<00:00, 481.75it/s]\n",
-            "Evaluating step(8): 0.875 across 16 samples, Max potential: 0.875: 100%|██████████| 16/16 [00:03<00:00,  5.21it/s]\n",
-            "Proposing:   0%|          | 0/5 [00:06<?, ?it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 0.875 >= 0.875\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1112.82it/s]\n",
-            "Evaluating step(9): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:16<00:00,  2.97it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer step: 0.86 > 0.84\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 2395.58it/s]\n",
-            "Evaluating step(8): 0.87 across 100 samples, Max potential: 0.87: 100%|██████████| 100/100 [00:30<00:00,  3.30it/s]\n",
-            "Training Step: 10:  69%|██████▉   | 9/13 [02:52<02:04, 31.23s/it]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 212.83it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.04it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 655.18it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1241.84it/s]\n",
-            "Training Step: 11:  77%|███████▋  | 10/13 [02:55<01:07, 22.43s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Skipping batch 9 as acc: 1.0\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.95it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.23it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 757.71it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1320.62it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.75\n",
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "Subset loss backward time: 3.768970012664795\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 125.10it/s]\n",
-            "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.77it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:08,  2.19s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 571.28it/s]\n",
-            "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 429.07it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.58s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to categories and quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 111.64it/s]\n",
-            "Evaluating step(10): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  2.63it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 1.0 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 455.77it/s]\n",
-            "Evaluating step(10): 1.0 across 4 samples, Max potential: 1.0: 100%|██████████| 4/4 [00:00<00:00,  5.14it/s]\n",
-            "Proposing:  40%|████      | 2/5 [00:06<00:09,  3.17s/it]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 1.0 >= 0.75\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1732.93it/s]\n",
-            "Evaluating step(11): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:18<00:04,  2.21it/s]\n",
-            "Training Step: 12:  85%|████████▍ | 11/13 [03:24<00:49, 24.61s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.8048780487804879 <= 0.86\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 128.86it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.24it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 470.20it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2608.40it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 6\n",
-            "Moving batch error size: 2\n",
-            "Moving batch acc: 0.75\n",
-            "Moving batch correct size: 6\n",
-            "Moving batch error size: 2\n",
-            "Subset Error size: 2\n",
-            "Subset Correct size: 4\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "Subset loss backward time: 6.722561836242676\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 265.78it/s]\n",
-            "Evaluating step(11): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:01<00:00,  3.58it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:10,  2.65s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 396.33it/s]\n",
-            "Evaluating step(11): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 354.51it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:03<00:05,  1.80s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 789.39it/s]\n",
-            "Evaluating step(11): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 233.79it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.49s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each relevant item, excluding any that do not fit the category. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 181.12it/s]\n",
-            "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:02<00:00,  2.13it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:08<00:02,  2.44s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each relevant item, excluding any that do not fit the category. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 807.04it/s]\n",
-            "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 275.78it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:10<00:00,  2.01s/it]\n",
-            "Training Step: 12:  92%|█████████▏| 12/13 [03:43<00:18, 18.61s/it]\n",
-            "Epoch: 100%|██████████| 1/1 [03:43<00:00, 223.37s/it]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n",
-            "Reached max steps\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            ")\n",
-            "Teacher generator configured.\n",
-            "save to /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
-            "Starting step: 12\n",
-            "trainer_results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 13:   0%|          | 0/12 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 13\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 158.10it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 6c34d6e5-0e3d-4243-834e-fd6c5883f467 already exists. Updating the trace.Trace with id 234e39df-1bc4-41df-a515-895cb2614a53 already exists. Updating the trace.\n",
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.35it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 8895d6bd-eab0-48af-ad4b-51f8007258b1 already exists. Updating the trace.\n",
-            "Trace with id c42fea48-1b90-4388-92c4-b65b4356a3a2 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 490.46it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1656.19it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 247.40it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.77it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 365.97it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9294.86it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['c42fea48-1b90-4388-92c4-b65b4356a3a2']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Trombone: 1\\n\\n  2. Violin: 1\\n\\n  3. Clarinet: 1\\n\\n  4. Accordion: 1\\n\\n  5. Flutes: 4\\n\\n  6. Trumpet: 1\\n\\n  7. Drums: 2\\n\\n  8. Piano: 1\\n\\n\\n  Now, let''s add them up:\\n\\n\\n  1 + 1 + 1 + 1 + 4 + 1 + 2 + 1 = 12\\n\\n\\n  Answer: 12'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2578.13it/s]\n",
-            "Evaluating step(13): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:23<00:13,  1.35it/s]\n",
-            "Training Step: 14:  17%|█▋        | 2/12 [00:27<04:35, 27.54s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
-            "Training Step: 14\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 136.94it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id fd34672a-ffd1-498e-a88f-283aa9d4f65d already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.92it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 46a8994f-fce6-4031-b251-1c8af31d88d2 already exists. Updating the trace.\n",
-            "Trace with id 2bc992c0-9832-47f1-87c3-9f6e4b18ee99 already exists. Updating the trace.\n",
-            "Trace with id 12a6ff3d-f54d-4d89-b5f0-1aec30e96398 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 443.10it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 3302.60it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 114.14it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.59it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 685.93it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 5111.89it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1784.60it/s]\n",
-            "Evaluating step(14): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:19<00:11,  1.61it/s]\n",
-            "Training Step: 15:  33%|███▎      | 4/12 [00:52<02:10, 16.36s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
-            "Training Step: 15\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 164.67it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 660c5004-35d2-4a6d-9a06-1e0b3f032f21 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training:  25%|██▌       | 1/4 [00:00<00:02,  1.12it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d3f33ded-170a-4b87-9b0b-987d5fb7b817 already exists. Updating the trace.\n",
-            "Trace with id de4e75d6-a21b-4004-925d-a9a818bd0f7c already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.02it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 1f682cab-026c-4803-8018-a45d027aa026 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 665.05it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1875.18it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 160.86it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.14it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 621.42it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9054.08it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2982.93it/s]\n",
-            "Evaluating step(15): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 495.97it/s]\n",
-            "Training Step: 16:  42%|████▏     | 5/12 [00:56<01:03,  9.03s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
-            "Training Step: 16\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 127.68it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 0e8910c8-703d-4766-a483-c5691125fd03 already exists. Updating the trace.Trace with id ffe67a7b-7b81-4302-b6ed-4b506570274b already exists. Updating the trace.\n",
-            "Trace with id e250f80e-334e-4f85-ac1f-df9a2013d578 already exists. Updating the trace.\n",
-            "Trace with id fe9b883c-4f47-44f7-a388-b03a2fb10413 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 534.68it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 201.71it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10453.09it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 195.85it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.52it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 560.49it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1250.72it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3902.04it/s]\n",
-            "Evaluating step(16): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 398.91it/s]\n",
-            "Training Step: 17:  58%|█████▊    | 7/12 [00:58<00:35,  7.16s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
-            "Training Step: 17\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data:   0%|          | 0/4 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d46e538c-832d-4eb5-ba9b-a308f666baba already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rLoading Data: 100%|██████████| 4/4 [00:00<00:00, 106.99it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id a9a202f5-e723-4d24-ae5e-ad1084a52ef8 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training:  75%|███████▌  | 3/4 [00:00<00:00,  3.09it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 74d1bc97-46cd-406d-8c3a-2f999aae1b2f already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.92it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 334.77it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 874.86it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 370.55it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.81it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 482.84it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 645.40it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2888.08it/s]\n",
-            "Evaluating step(17): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 221.76it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 18:  67%|██████▋   | 8/12 [01:02<00:19,  4.87s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 18\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 111.28it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d4194dd1-739a-4509-8ac8-7c3f89649ee7 already exists. Updating the trace.Trace with id 1eb770ed-ff6f-481e-8c16-b9749a44a1a6 already exists. Updating the trace.\n",
-            "Trace with id 7694df14-3a24-40bd-a3fa-036c2645eca3 already exists. Updating the trace.\n",
-            "\n",
-            "Trace with id 4cd9f4ec-2648-4e85-8e17-3dae1b8558d3 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 585.96it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 225.18it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1038.07it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 250.95it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.18it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 438.82it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2456.40it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2422.27it/s]\n",
-            "Evaluating step(18): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 456.47it/s]\n",
-            "Training Step: 19:  75%|███████▌  | 9/12 [01:05<00:13,  4.41s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
-            "Training Step: 19\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 57.52it/s]\n",
-            "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 5124e2e6-2aac-4dd3-ab63-9277a7b806a7 already exists. Updating the trace.\n",
-            "Trace with id 1d3eceeb-ad24-40f6-8752-2f38241172cb already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.16it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 3a9a47c8-a210-43a4-8d24-b9159babb6e4 already exists. Updating the trace.Trace with id 6c0d3a9a-bb01-4fb3-a68b-1edf66861235 already exists. Updating the trace.\n",
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 193.38it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 6143.25it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 107.12it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.42it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 375.70it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10505.46it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3644.75it/s]\n",
-            "Evaluating step(19): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 275.17it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 20:  92%|█████████▏| 11/12 [01:09<00:04,  4.32s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 20\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 125.16it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id b538075d-01af-4b76-b835-9005f3044609 already exists. Updating the trace.\n",
-            "Trace with id dd9d8748-4926-4bcd-902d-6a4c5cb38267 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 60866bed-8020-4610-a39a-a4a730c035db already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00,  4.20it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 85d63f78-39c0-4753-a9fc-52202df48673 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 328.35it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 999.36it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 239.24it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.87it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 353.26it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 391.07it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['60866bed-8020-4610-a39a-a4a730c035db']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Bed: 1\\n\\n  2. Fridge: 1\\n\\n  3. Lamp: 1\\n\\n  4. Toaster: 1\\n\\n  5. Chairs: 4\\n\\n  6. Table: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 1 + 1 + 4 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1763.23it/s]\n",
-            "Evaluating step(20): 0.7083 across 24 samples, Max potential: 0.86:  48%|████▊     | 24/50 [00:17<00:18,  1.38it/s]\n",
-            "Training Step: 21: 100%|██████████| 12/12 [01:34<00:00,  7.82s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.68 <= 0.86, revert\n",
-            "Training Step: 21\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 208.10it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id aefd17e5-9682-4420-a820-c484a63d6dcd already exists. Updating the trace.\n",
-            "Trace with id 04e77795-cc9b-4530-a883-5f775e3fbc76 already exists. Updating the trace.\n",
-            "Trace with id 701be0ee-29e0-42f5-be04-72d2b73e3968 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00,  4.56it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 433650a5-ca75-4867-b235-3af4a7c55c67 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 187.26it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2595.49it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 129.91it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.86it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 172.30it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 689.23it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3183.48it/s]\n",
-            "Evaluating step(21): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 438.75it/s]\n",
-            "Training Step: 22: : 13it [01:38,  6.76s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
-            "Training Step: 22\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data:   0%|          | 0/4 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 4dad0f65-d624-48c2-a795-596c00b0535a already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 86.81it/s]\n",
-            "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 1b4b3ab0-d20f-4fc2-a09c-4592a227a8e5 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.23it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 82cf82ff-d826-4bb1-847c-9938aeec8ff5 already exists. Updating the trace.\n",
-            "Trace with id ac43f3d4-d67d-4912-95d6-0baa09b52d9a already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 143.58it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 842.95it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 133.83it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.36it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 326.14it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 307.38it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['60866bed-8020-4610-a39a-a4a730c035db']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Bed: 1\\n\\n  2. Fridge: 1\\n\\n  3. Lamp: 1\\n\\n  4. Toaster: 1\\n\\n  5. Chairs: 4\\n\\n  6. Table: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 1 + 1 + 4 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 5440.79it/s]\n",
-            "Evaluating step(22): 0.7083 across 24 samples, Max potential: 0.86:  48%|████▊     | 24/50 [00:00<00:00, 303.26it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.68 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 23: : 14it [01:42,  6.13s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 23\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 91.93it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id daa5804f-1aad-4f01-b26c-6b31c57f065f already exists. Updating the trace.\n",
-            "Trace with id e2bfbbe0-fb79-4df5-9a7d-50c9085947bc already exists. Updating the trace.\n",
-            "Trace with id 71d549d2-9cc8-46ba-a7f6-d07f69263fd3 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.56it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 840d9ed5-8222-45a9-a406-7445feae9733 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 63.89it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 201.47it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 90.61it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.96it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 287.69it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1938.89it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3010.90it/s]\n",
-            "Evaluating step(23): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 423.98it/s]\n",
-            "Training Step: 24: : 16it [01:48,  6.22s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
-            "Training Step: 24\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 122.52it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 96c716a1-e984-4fe3-9ce0-e156ac709edb already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 000a3738-1f09-40b0-9f8b-2dec63a3f7f8 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.21it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d71ad721-d21d-42f1-af9b-719ff026406b already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 106.06it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1513.37it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 265.42it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.07it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 171.27it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 862.32it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['840d9ed5-8222-45a9-a406-7445feae9733']\n",
-            "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Table: 1\\n\\n  3. Fridge: 1\\n\\n  4. Stove: 1\\n\\n  5. Oven: 1\\n\\n  6. Toaster: 1\\n\\n  7. Couch: 1\\n\\n  8. Cars: 4\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 1 + 1 + 1 + 1 + 1 + 4 = 11\\n\\n\\n  Answer: 11'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1210.01it/s]\n",
-            "Evaluating step(24): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:18<00:00,  2.69it/s]\n",
-            "Training Step: 24: 100%|██████████| 12/12 [02:15<00:00, 11.26s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.86 <= 0.86, revert\n",
-            "Saved ckpt to /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
-            "Training time: 359.32386112213135s\n",
-            "ckpt_file: /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n"
-          ]
-        }
-      ],
-      "source": [
-        "train(debug=False, max_steps=12, strategy=\"constrained\",\n",
-        "      raw_shots=0, bootstrap_shots=1,\n",
-        "      exclude_input_fields_from_bootstrap_demos=True\n",
-        "      )"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.75\n",
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 0.0\n",
+      "Subset loss backward time: 5.383355617523193\n",
+      "Optimizer propose...\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "KAyFhzrG_J4l"
-      },
-      "source": [
-        "Here is our scores for each step:\n",
-        "\n",
-        "\"val_scores\": [\n",
-        "        0.8,\n",
-        "        0.8,\n",
-        "        0.8,\n",
-        "        0.8,\n",
-        "        0.8,\n",
-        "        0.84,\n",
-        "        0.84,\n",
-        "        0.84,\n",
-        "        0.84,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86,\n",
-        "        0.86\n",
-        "    ]\n",
-        "\n",
-        "  \"test_scores\": [\n",
-        "        0.83,\n",
-        "        0.83,\n",
-        "        0.83,\n",
-        "        0.83,\n",
-        "        0.83,\n",
-        "        0.91,\n",
-        "        0.91,\n",
-        "        0.91,\n",
-        "        0.91,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87,\n",
-        "        0.87\n",
-        "    ]\n",
-        "\n",
-        "\n",
-        "It is normal when the score of the validation does not exactly match to that of the test set. You can also train with just the test set. You can modify the fit arguments as\n",
-        "\n",
-        "```\n",
-        "trainer.fit(\n",
-        "        train_dataset=train_dataset,\n",
-        "        val_dataset=test_dataset,\n",
-        "        # test_dataset=test_dataset,\n",
-        "        debug=debug,\n",
-        "        resume_from_ckpt=resume_from_ckpt,\n",
-        "    )\n",
-        "```"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "953BV81y0JFv"
-      },
-      "source": [
-        "# 🔥 Resume Checkpoint\n",
-        "\n",
-        "We might want to continue from the earlier step and to train more steps\n",
-        "\n",
-        "This is easy to do.\n",
-        "\n",
-        "**Note: Ensure you copy the path you had, and replace it, as your run might create a different file name.**"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "kde1V1AE7Ty0",
-        "outputId": "52d69b69-0a3a-4780-ca26-25956cc023c7"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
-            "ObjectCountAdalComponent(\n",
-            "  eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
-            "  (task): ObjectCountTaskPipeline(\n",
-            "    (llm_counter): Generator(\n",
-            "      model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "      (prompt): Prompt(\n",
-            "        template: <START_OF_SYSTEM_PROMPT>\n",
-            "        {{system_prompt}}\n",
-            "        {# Few shot demos #}\n",
-            "        {% if few_shot_demos is not none %}\n",
-            "        Here are some examples:\n",
-            "        {{few_shot_demos}}\n",
-            "        {% endif %}\n",
-            "        <END_OF_SYSTEM_PROMPT>\n",
-            "        <START_OF_USER>\n",
-            "        {{input_str}}\n",
-            "        <END_OF_USER>\n",
-            "        , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "      )\n",
-            "      (model_client): OpenAIClient()\n",
-            "      (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "    )\n",
-            "  )\n",
-            "  (loss_fn): EvalFnToTextLoss()\n",
-            ")\n",
-            "Trainer(\n",
-            "  (adaltask): ObjectCountAdalComponent(\n",
-            "    eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
-            "    (task): ObjectCountTaskPipeline(\n",
-            "      (llm_counter): Generator(\n",
-            "        model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "        (prompt): Prompt(\n",
-            "          template: <START_OF_SYSTEM_PROMPT>\n",
-            "          {{system_prompt}}\n",
-            "          {# Few shot demos #}\n",
-            "          {% if few_shot_demos is not none %}\n",
-            "          Here are some examples:\n",
-            "          {{few_shot_demos}}\n",
-            "          {% endif %}\n",
-            "          <END_OF_SYSTEM_PROMPT>\n",
-            "          <START_OF_USER>\n",
-            "          {{input_str}}\n",
-            "          <END_OF_USER>\n",
-            "          , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "        )\n",
-            "        (model_client): OpenAIClient()\n",
-            "        (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "      )\n",
-            "    )\n",
-            "    (loss_fn): EvalFnToTextLoss()\n",
-            "  )\n",
-            ")\n",
-            "raw_shots: 0, bootstrap_shots: 1\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator configured.\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Backward engine configured for all generators.\n",
-            "Restoring prompts: PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True)\n",
-            "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 27:   0%|          | 0/13 [00:00<?, ?it/s]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 417.64it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 1073.40it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 571.14it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1126.21it/s]\n",
-            "Training Step: 28:   0%|          | 0/13 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Skipping batch 0 as acc: 1.0\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 604.56it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.83it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 540.00it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1445.81it/s]\n",
-            "Training Step: 29:  15%|█▌        | 2/13 [00:02<00:12,  1.15s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Skipping batch 1 as acc: 1.0\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 318.87it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00,  4.06it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 458.88it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1186.26it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.75\n",
-            "Moving batch correct size: 3\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "Subset loss backward time: 4.518843650817871\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 142.52it/s]\n",
-            "Evaluating step(2): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  1.56it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:11,  2.99s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities specified in the input. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 118.95it/s]\n",
-            "Evaluating step(2): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  1.76it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:05<00:08,  2.85s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 129.26it/s]\n",
-            "Evaluating step(2): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  1.54it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 1.0 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 263.51it/s]\n",
-            "Evaluating step(2): 1.0 across 4 samples, Max potential: 1.0: 100%|██████████| 4/4 [00:00<00:00,  4.20it/s]\n",
-            "Proposing:  40%|████      | 2/5 [00:10<00:15,  5.11s/it]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 1.0 >= 0.75\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2287.37it/s]\n",
-            "Evaluating step(29): 0.8158 across 38 samples, Max potential: 0.86:  76%|███████▌  | 38/50 [00:17<00:05,  2.17it/s]\n",
-            "Training Step: 30:  23%|██▎       | 3/13 [00:35<02:25, 14.59s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.7948717948717948 <= 0.86\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 268.93it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.69it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 603.76it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 8825.47it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 7\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.875\n",
-            "Moving batch correct size: 7\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "Subset loss backward time: 2.2182435989379883\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure you account for all items. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 160.12it/s]\n",
-            "Evaluating step(3): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  1.72it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:11,  2.83s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure each item is counted correctly. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 89.23it/s]\n",
-            "Evaluating step(3): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 1.0 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 8/8 [00:00<00:00, 281.73it/s]\n",
-            "Evaluating step(3): 1.0 across 8 samples, Max potential: 1.0: 100%|██████████| 8/8 [00:02<00:00,  2.96it/s]\n",
-            "Proposing:  20%|██        | 1/5 [00:08<00:34,  8.54s/it]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 1.0 >= 0.875\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1910.10it/s]\n",
-            "Evaluating step(30): 0.72 across 25 samples, Max potential: 0.86:  50%|█████     | 25/50 [00:18<00:18,  1.38it/s]\n",
-            "Training Step: 31:  31%|███       | 4/13 [01:05<03:03, 20.39s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.6923076923076923 <= 0.86\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 310.31it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.75it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 454.32it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12336.19it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 11\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.9166666666666666\n",
-            "Moving batch correct size: 11\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "Subset loss backward time: 2.028568983078003\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure each item is counted correctly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 121.52it/s]\n",
-            "Evaluating step(4): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  2.10it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 1.0 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 12/12 [00:00<00:00, 724.90it/s]\n",
-            "Evaluating step(4): 1.0 across 12 samples, Max potential: 1.0: 100%|██████████| 12/12 [00:03<00:00,  3.66it/s]\n",
-            "Proposing:   0%|          | 0/5 [00:05<?, ?it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 1.0 >= 0.9166666666666666\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2233.56it/s]\n",
-            "Evaluating step(31): 0.8511 across 47 samples, Max potential: 0.86:  94%|█████████▍| 47/50 [00:16<00:01,  2.81it/s]\n",
-            "Training Step: 32:  38%|███▊      | 5/13 [01:31<02:58, 22.30s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.8333333333333334 <= 0.86\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 269.31it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.20it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 606.49it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1212.58it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 15\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.9375\n",
-            "Moving batch correct size: 15\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "Subset loss backward time: 3.2150633335113525\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 130.57it/s]\n",
-            "Evaluating step(5): 0.5 across 2 samples, Max potential: 0.6667:  33%|███▎      | 1/3 [00:01<00:02,  1.39s/it]INFO:backoff:Backing off call(...) for 0.2s (openai.InternalServerError: <html>\n",
-            "<head><title>500 Internal Server Error</title></head>\n",
-            "<body>\n",
-            "<center><h1>500 Internal Server Error</h1></center>\n",
-            "<hr><center>nginx</center>\n",
-            "</body>\n",
-            "</html>)\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:50<00:00, 16.89s/it]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:52<03:28, 52.11s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 645.05it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 298.94it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:53<01:07, 22.46s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 751.40it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 360.88it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:54<00:25, 12.66s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 332.13it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 276.08it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:55<00:08,  8.12s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 440.13it/s]\n",
-            "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 235.96it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:57<00:00, 11.41s/it]\n",
-            "Training Step: 33:  46%|████▌     | 6/13 [02:33<04:07, 35.35s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 317.05it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 676.47it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 543.36it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1518.44it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 18\n",
-            "Moving batch error size: 2\n",
-            "Moving batch acc: 0.9\n",
-            "Moving batch correct size: 18\n",
-            "Moving batch error size: 2\n",
-            "Subset Error size: 2\n",
-            "Subset Correct size: 4\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "Subset loss backward time: 7.857504606246948\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 282.66it/s]\n",
-            "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:02<00:00,  2.75it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:03<00:13,  3.26s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 687.22it/s]\n",
-            "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 539.26it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:04<00:06,  2.16s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 608.62it/s]\n",
-            "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 246.48it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:05<00:03,  1.68s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 417.60it/s]\n",
-            "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 422.96it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:07<00:01,  1.58s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 464.91it/s]\n",
-            "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 269.93it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:08<00:00,  1.67s/it]\n",
-            "Training Step: 34:  54%|█████▍    | 7/13 [02:49<02:55, 29.23s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 104.68it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.42it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 556.85it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14230.04it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 22\n",
-            "Moving batch error size: 2\n",
-            "Moving batch acc: 0.9\n",
-            "Moving batch correct size: 18\n",
-            "Moving batch error size: 2\n",
-            "Subset Error size: 2\n",
-            "Subset Correct size: 4\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "Subset loss backward time: 6.2225048542022705\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 584.16it/s]\n",
-            "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:01<00:00,  4.41it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:10,  2.54s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 943.25it/s]\n",
-            "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 367.37it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.65s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 802.76it/s]\n",
-            "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 290.57it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.44s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 736.81it/s]\n",
-            "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 352.92it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:05<00:01,  1.31s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 596.84it/s]\n",
-            "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 250.75it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.45s/it]\n",
-            "Training Step: 35:  62%|██████▏   | 8/13 [03:04<02:04, 24.82s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 70.79it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.78it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 388.55it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2027.46it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 22\n",
-            "Moving batch error size: 2\n",
-            "Moving batch acc: 0.9\n",
-            "Moving batch correct size: 18\n",
-            "Moving batch error size: 2\n",
-            "Subset Error size: 2\n",
-            "Subset Correct size: 4\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "Subset loss backward time: 5.618266582489014\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 304.00it/s]\n",
-            "Evaluating step(8): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:02<00:00,  2.79it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:03<00:13,  3.44s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure each item is counted correctly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 128.97it/s]\n",
-            "Evaluating step(8): 1.0 across 6 samples, Max potential: 1.0: 100%|██████████| 6/6 [00:01<00:00,  3.62it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass subset check: 1.0 > 0.6666666666666666\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 20/20 [00:00<00:00, 649.93it/s]\n",
-            "Evaluating step(8): 0.95 across 20 samples, Max potential: 0.95: 100%|██████████| 20/20 [00:02<00:00,  8.93it/s]\n",
-            "Proposing:  20%|██        | 1/5 [00:08<00:35,  8.79s/it]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass full check: 0.95 >= 0.9\n",
-            "Done with proposals\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2667.62it/s]\n",
-            "Evaluating step(35): 0.8511 across 47 samples, Max potential: 0.86:  94%|█████████▍| 47/50 [00:00<00:00, 559.52it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.8333333333333334 <= 0.86\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 36:  69%|██████▉   | 9/13 [03:21<01:29, 22.39s/it]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 154.85it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.33it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 610.06it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1798.78it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 22\n",
-            "Moving batch error size: 2\n",
-            "Moving batch acc: 0.95\n",
-            "Moving batch correct size: 19\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "Subset loss backward time: 2.553833246231079\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 228.47it/s]\n",
-            "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.44it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:09,  2.47s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 700.57it/s]\n",
-            "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 207.56it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:03<00:05,  1.69s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 782.91it/s]\n",
-            "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 712.51it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.49s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 269.05it/s]\n",
-            "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 266.32it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:06<00:01,  1.40s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 466.64it/s]\n",
-            "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 498.14it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.48s/it]\n",
-            "Training Step: 37:  77%|███████▋  | 10/13 [03:33<00:56, 18.97s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 115.54it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.77it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 561.81it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1002.40it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 23\n",
-            "Moving batch error size: 1\n",
-            "Moving batch acc: 0.95\n",
-            "Moving batch correct size: 19\n",
-            "Moving batch error size: 1\n",
-            "Subset Error size: 1\n",
-            "Subset Correct size: 2\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "Subset loss backward time: 2.35148024559021\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 139.22it/s]\n",
-            "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00,  3.95it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:01<00:07,  1.81s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 277.60it/s]\n",
-            "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 561.39it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:02<00:04,  1.42s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 736.01it/s]\n",
-            "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 168.63it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:03<00:02,  1.24s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 441.77it/s]\n",
-            "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 518.09it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:05<00:01,  1.19s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 396.70it/s]\n",
-            "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 199.84it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:06<00:00,  1.27s/it]\n",
-            "Training Step: 38:  85%|████████▍ | 11/13 [03:43<00:32, 16.20s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 138.49it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00,  6.41it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 610.01it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10665.74it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Moving batch correct size: 22\n",
-            "Moving batch error size: 2\n",
-            "Moving batch acc: 0.9\n",
-            "Moving batch correct size: 18\n",
-            "Moving batch error size: 2\n",
-            "Subset Error size: 2\n",
-            "Subset Correct size: 4\n",
-            "Subset score: 0.6666666666666666\n",
-            "Subset batch acc: 0.6666666666666666\n",
-            "Subset loss backward...\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "Subset loss backward time: 11.797855138778687\n",
-            "Optimizer propose...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 221.09it/s]\n",
-            "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:01<00:00,  4.45it/s]\n",
-            "\n",
-            "Proposing:  20%|██        | 1/5 [00:02<00:09,  2.46s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 690.80it/s]\n",
-            "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 309.16it/s]\n",
-            "\n",
-            "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.61s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 488.13it/s]\n",
-            "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 365.81it/s]\n",
-            "\n",
-            "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.36s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 693.52it/s]\n",
-            "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 272.61it/s]\n",
-            "\n",
-            "Proposing:  80%|████████  | 4/5 [00:05<00:01,  1.35s/it]\u001b[A"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "\n",
-            "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 767.58it/s]\n",
-            "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 719.89it/s]\n",
-            "\n",
-            "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.41s/it]\n",
-            "Training Step: 38:  92%|█████████▏| 12/13 [04:02<00:20, 20.21s/it]\n",
-            "Epoch: 100%|██████████| 1/1 [04:02<00:00, 242.58s/it]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
-            "Done with proposals\n",
-            "No proposal can improve the subset and full set, go to next step\n",
-            "Reached max steps\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            ")\n",
-            "Teacher generator configured.\n",
-            "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
-            "Starting step: 38\n",
-            "trainer_results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 39:   0%|          | 0/12 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 39\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 161.31it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 54e272c5-1360-462e-b773-4c58c61472ee already exists. Updating the trace.\n",
-            "Trace with id 0e8910c8-703d-4766-a483-c5691125fd03 already exists. Updating the trace.\n",
-            "Trace with id 6c0d3a9a-bb01-4fb3-a68b-1edf66861235 already exists. Updating the trace.\n",
-            "Trace with id fe9b883c-4f47-44f7-a388-b03a2fb10413 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 812.53it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 2283.86it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11023.14it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 294.28it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.11it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 485.47it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11015.90it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['0e8910c8-703d-4766-a483-c5691125fd03']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Fridge\\n\\n  2. Chair\\n\\n  3. Bed\\n\\n  4. Oven\\n\\n  5. Microwave\\n\\n  6. Car\\n\\n\\n  There are 6 objects in total.\\n\\n\\n  Answer: 6'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3194.64it/s]\n",
-            "Evaluating step(39): 0.6818 across 22 samples, Max potential: 0.86:  44%|████▍     | 22/50 [00:15<00:19,  1.45it/s]\n",
-            "Training Step: 40:  17%|█▋        | 2/12 [00:17<02:58, 17.85s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.6521739130434783 <= 0.86, revert\n",
-            "Training Step: 40\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 697.57it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id daa5804f-1aad-4f01-b26c-6b31c57f065f already exists. Updating the trace.\n",
-            "Trace with id 71d549d2-9cc8-46ba-a7f6-d07f69263fd3 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 562.43it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id b538075d-01af-4b76-b835-9005f3044609 already exists. Updating the trace.Trace with id fd34672a-ffd1-498e-a88f-283aa9d4f65d already exists. Updating the trace.\n",
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 577.17it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9709.04it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 142.07it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.41it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 311.77it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 713.44it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3973.84it/s]\n",
-            "Evaluating step(40): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 440.54it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 41:  33%|███▎      | 4/12 [00:22<00:49,  6.19s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 41\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 155.20it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 840d9ed5-8222-45a9-a406-7445feae9733 already exists. Updating the trace.\n",
-            "Trace with id 4cd9f4ec-2648-4e85-8e17-3dae1b8558d3 already exists. Updating the trace.\n",
-            "Trace with id ac43f3d4-d67d-4912-95d6-0baa09b52d9a already exists. Updating the trace.\n",
-            "Trace with id 1eb770ed-ff6f-481e-8c16-b9749a44a1a6 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 1098.13it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 521.96it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10292.77it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 172.25it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.39it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 587.31it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1397.05it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3444.16it/s]\n",
-            "Evaluating step(41): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 318.28it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 42:  42%|████▏     | 5/12 [00:24<00:25,  3.71s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 42\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 268.35it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 4dad0f65-d624-48c2-a795-596c00b0535a already exists. Updating the trace.\n",
-            "Trace with id dd9d8748-4926-4bcd-902d-6a4c5cb38267 already exists. Updating the trace.\n",
-            "Trace with id 1f682cab-026c-4803-8018-a45d027aa026 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 522.44it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 344.49it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14755.69it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.06it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.03it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 454.94it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 5319.35it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4718.96it/s]\n",
-            "Evaluating step(42): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 404.64it/s]\n",
-            "Training Step: 43:  58%|█████▊    | 7/12 [00:27<00:17,  3.51s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n",
-            "Training Step: 43\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 261.59it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 46a8994f-fce6-4031-b251-1c8af31d88d2 already exists. Updating the trace.Trace with id 3a9a47c8-a210-43a4-8d24-b9159babb6e4 already exists. Updating the trace.\n",
-            "\n",
-            "Trace with id 234e39df-1bc4-41df-a515-895cb2614a53 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 428.10it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id de4e75d6-a21b-4004-925d-a9a818bd0f7c already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 296.10it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11374.38it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 239.89it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.62it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 447.30it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 475.76it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4960.15it/s]\n",
-            "Evaluating step(43): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 464.52it/s]\n",
-            "Training Step: 44:  67%|██████▋   | 8/12 [00:30<00:10,  2.51s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n",
-            "Training Step: 44\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 237.83it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id aefd17e5-9682-4420-a820-c484a63d6dcd already exists. Updating the trace.\n",
-            "Trace with id 2bc992c0-9832-47f1-87c3-9f6e4b18ee99 already exists. Updating the trace.Trace with id 945f82c7-03d9-4f49-8267-be7abac2bce6 already exists. Updating the trace.\n",
-            "Trace with id 12a6ff3d-f54d-4d89-b5f0-1aec30e96398 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 1138.91it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 394.77it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 443.51it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 247.66it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.52it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 373.33it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 830.43it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['aefd17e5-9682-4420-a820-c484a63d6dcd']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each vegetable step by step:\\n\\n\\n  1. Carrot: 1\\n\\n  2. Onion: 1\\n\\n  3. Stalk of celery: 1\\n\\n  4. Yams: 3\\n\\n  5. Garlic: 1\\n\\n  6. Head of broccoli: 1\\n\\n  7. Potato: 1\\n\\n\\n  Now, let''s add them up:\\n\\n\\n  1 + 1 + 1 + 3 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1197.95it/s]\n",
-            "Evaluating step(44): 0.8333 across 42 samples, Max potential: 0.86:  84%|████████▍ | 42/50 [00:22<00:04,  1.87it/s]\n",
-            "Training Step: 45:  75%|███████▌  | 9/12 [00:57<00:24,  8.31s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.813953488372093 <= 0.86, revert\n",
-            "Training Step: 45\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 164.91it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 701be0ee-29e0-42f5-be04-72d2b73e3968 already exists. Updating the trace.\n",
-            "Trace with id e2bfbbe0-fb79-4df5-9a7d-50c9085947bc already exists. Updating the trace.\n",
-            "Trace with id d4194dd1-739a-4509-8ac8-7c3f89649ee7 already exists. Updating the trace.\n",
-            "Trace with id 1b4b3ab0-d20f-4fc2-a09c-4592a227a8e5 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 731.86it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 244.23it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 395.27it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 140.54it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.11it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 448.16it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 658.37it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2621.44it/s]\n",
-            "Evaluating step(45): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 306.53it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 46:  92%|█████████▏| 11/12 [00:59<00:06,  6.78s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 46\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 256.89it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 82cf82ff-d826-4bb1-847c-9938aeec8ff5 already exists. Updating the trace.\n",
-            "Trace with id 5124e2e6-2aac-4dd3-ab63-9277a7b806a7 already exists. Updating the trace.Trace with id a9a202f5-e723-4d24-ae5e-ad1084a52ef8 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 426.47it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d3f33ded-170a-4b87-9b0b-987d5fb7b817 already exists. Updating the trace.\n",
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 266.65it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 380.40it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 251.95it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.75it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 411.12it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 511.05it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['aefd17e5-9682-4420-a820-c484a63d6dcd']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each vegetable step by step:\\n\\n\\n  1. Carrot: 1\\n\\n  2. Onion: 1\\n\\n  3. Stalk of celery: 1\\n\\n  4. Yams: 3\\n\\n  5. Garlic: 1\\n\\n  6. Head of broccoli: 1\\n\\n  7. Potato: 1\\n\\n\\n  Now, let''s add them up:\\n\\n\\n  1 + 1 + 1 + 3 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4016.92it/s]\n",
-            "Evaluating step(46): 0.8333 across 42 samples, Max potential: 0.86:  84%|████████▍ | 42/50 [00:00<00:00, 303.81it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.813953488372093 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 47: 100%|██████████| 12/12 [01:01<00:00,  4.42s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 47\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 96.23it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 85d63f78-39c0-4753-a9fc-52202df48673 already exists. Updating the trace.Trace with id 74d1bc97-46cd-406d-8c3a-2f999aae1b2f already exists. Updating the trace.\n",
-            "\n",
-            "Trace with id 60866bed-8020-4610-a39a-a4a730c035db already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 341.47it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 1d3eceeb-ad24-40f6-8752-2f38241172cb already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 167.75it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 846.95it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 136.09it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:04<00:00,  1.03s/it]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 191.47it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 923.91it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2836.52it/s]\n",
-            "Evaluating step(47): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 371.59it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 48: : 13it [01:07,  4.63s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 48\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 189.96it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 96c716a1-e984-4fe3-9ce0-e156ac709edb already exists. Updating the trace.\n",
-            "Trace with id 3835ee47-6951-49ec-b285-621fc1085024 already exists. Updating the trace.Trace with id 99607986-e107-46b8-b86b-177b295983c4 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 295.41it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Trace with id d46e538c-832d-4eb5-ba9b-a308f666baba already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 161.24it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1621.93it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 153.47it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.07it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 207.08it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 344.25it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s list and count the vegetables mentioned:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (two cabbages)\\n\\n  4. Garlic\\n\\n  5. Carrot\\n\\n  6. Broccoli (head of broccoli)\\n\\n  7. Potato\\n\\n  8. Celery (stalk of celery)\\n\\n  9. Lettuce (lettuce head)\\n\\n\\n  Now, let''s count each vegetable:\\n\\n\\n  1. Yam: 1\\n\\n  2. Cauliflower: 1\\n\\n  3. Cabbages: 2\\n\\n  4. Garlic: 1\\n\\n  5. Carrot: 1\\n\\n  6. Broccoli: 1\\n\\n  7. Potato: 1\\n\\n  8. Celery: 1\\n\\n  9. Lettuce: 1\\n\\n\\n  Adding them up:\\n\\n\\n  1 + 1 + 2 + 1 + 1 + 1 + 1 + 1 + 1 = 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1430.74it/s]\n",
-            "Evaluating step(48): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:28<00:07,  1.41it/s]\n",
-            "Training Step: 49: : 14it [01:39, 11.59s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n",
-            "Training Step: 49\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 122.71it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id ffe67a7b-7b81-4302-b6ed-4b506570274b already exists. Updating the trace.\n",
-            "Trace with id 8895d6bd-eab0-48af-ad4b-51f8007258b1 already exists. Updating the trace.\n",
-            "Trace with id d71ad721-d21d-42f1-af9b-719ff026406b already exists. Updating the trace.Trace with id e250f80e-334e-4f85-ac1f-df9a2013d578 already exists. Updating the trace.\n",
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 421.38it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 121.46it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1767.14it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 166.47it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.02it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 206.20it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 983.31it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3168.14it/s]\n",
-            "Evaluating step(49): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 492.44it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 50: : 16it [01:42,  9.33s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 50\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 108.30it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id c42fea48-1b90-4388-92c4-b65b4356a3a2 already exists. Updating the trace.\n",
-            "Trace with id 660c5004-35d2-4a6d-9a06-1e0b3f032f21 already exists. Updating the trace.\n",
-            "Trace with id 7694df14-3a24-40bd-a3fa-036c2645eca3 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 220.83it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1212.75it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 90.57it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.12it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 208.93it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1002.82it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
-            "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2261.91it/s]\n",
-            "Evaluating step(50): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 281.78it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 50: 100%|██████████| 12/12 [01:49<00:00,  9.15s/it]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Saved ckpt to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
-            "Training time: 352.5873613357544s\n",
-            "ckpt_file: /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        }
-      ],
-      "source": [
-        "\n",
-        "ckpt_path = \"/content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\"\n",
-        "\n",
-        "train(debug=False, max_steps=12, strategy=\"constrained\",\n",
-        "                raw_shots=0, bootstrap_shots=1,\n",
-        "                resume_from_ckpt=ckpt_path,\n",
-        "                exclude_input_fields_from_bootstrap_demos=True)"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 225.14it/s]\n",
+      "Evaluating step(4): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  2.43it/s]\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "m5fZGQqLE78r"
-      },
-      "source": [
-        "I decide to try more, this time, using strategy \"random\". And in the bootstrap demo, there is one shot, but I ensure I also add the \"input\" in the demonstration."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 1.0 > 0.6666666666666666\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "78JAv4ULEn07",
-        "outputId": "e87bb360-fc26-4dbd-d163-86ab32c292df"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
-            "ObjectCountAdalComponent(\n",
-            "  eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
-            "  (task): ObjectCountTaskPipeline(\n",
-            "    (llm_counter): Generator(\n",
-            "      model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "      (prompt): Prompt(\n",
-            "        template: <START_OF_SYSTEM_PROMPT>\n",
-            "        {{system_prompt}}\n",
-            "        {# Few shot demos #}\n",
-            "        {% if few_shot_demos is not none %}\n",
-            "        Here are some examples:\n",
-            "        {{few_shot_demos}}\n",
-            "        {% endif %}\n",
-            "        <END_OF_SYSTEM_PROMPT>\n",
-            "        <START_OF_USER>\n",
-            "        {{input_str}}\n",
-            "        <END_OF_USER>\n",
-            "        , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "      )\n",
-            "      (model_client): OpenAIClient()\n",
-            "      (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "    )\n",
-            "  )\n",
-            "  (loss_fn): EvalFnToTextLoss()\n",
-            ")\n",
-            "Trainer(\n",
-            "  (adaltask): ObjectCountAdalComponent(\n",
-            "    eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
-            "    (task): ObjectCountTaskPipeline(\n",
-            "      (llm_counter): Generator(\n",
-            "        model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "        (prompt): Prompt(\n",
-            "          template: <START_OF_SYSTEM_PROMPT>\n",
-            "          {{system_prompt}}\n",
-            "          {# Few shot demos #}\n",
-            "          {% if few_shot_demos is not none %}\n",
-            "          Here are some examples:\n",
-            "          {{few_shot_demos}}\n",
-            "          {% endif %}\n",
-            "          <END_OF_SYSTEM_PROMPT>\n",
-            "          <START_OF_USER>\n",
-            "          {{input_str}}\n",
-            "          <END_OF_USER>\n",
-            "          , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "        )\n",
-            "        (model_client): OpenAIClient()\n",
-            "        (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "      )\n",
-            "    )\n",
-            "    (loss_fn): EvalFnToTextLoss()\n",
-            "  )\n",
-            ")\n",
-            "raw_shots: 0, bootstrap_shots: 1\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator configured.\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Backward engine configured for all generators.\n",
-            "Restoring prompts: PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True)\n",
-            "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 51:   0%|          | 0/13 [00:00<?, ?it/s]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 415.27it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 224.54it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 423.57it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10894.30it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2199.38it/s]\n",
-            "Evaluating step(51): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:12<00:00,  3.97it/s]\n",
-            "Training Step: 52:   8%|▊         | 1/13 [00:18<03:38, 18.20s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.86 <= 0.86\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 402.10it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 785.01it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 842.02it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 6660.27it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1760.33it/s]\n",
-            "Evaluating step(52): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:12<00:00,  3.96it/s]\n",
-            "Training Step: 53:  15%|█▌        | 2/13 [00:36<03:21, 18.28s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.86 <= 0.86\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 571.26it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:00<00:00, 988.41it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 608.29it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1177.76it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2074.29it/s]\n",
-            "Evaluating step(53): 0.88 across 50 samples, Max potential: 0.88: 100%|██████████| 50/50 [00:16<00:00,  3.07it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer step: 0.88 > 0.86\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 5848.08it/s]\n",
-            "Evaluating step(53): 0.9 across 100 samples, Max potential: 0.9: 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]\n",
-            "Training Step: 54:  23%|██▎       | 3/13 [01:28<05:35, 33.51s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 297.78it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.95it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 407.40it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 8952.62it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1274.72it/s]\n",
-            "Evaluating step(54): 0.94 across 50 samples, Max potential: 0.94: 100%|██████████| 50/50 [00:16<00:00,  3.06it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer step: 0.94 > 0.88\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 6831.78it/s]\n",
-            "Evaluating step(54): 0.91 across 100 samples, Max potential: 0.91: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]\n",
-            "Training Step: 55:  31%|███       | 4/13 [02:21<06:10, 41.21s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 152.84it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 688.86it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1318.45it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data='You will answer a reasoning question. Carefully count each item and verify your total. List each item individually, ensuring each is counted as \"1\" regardless of quantity mentioned. Show your calculations step by step. The last line of your response should be: \\'Answer: $VALUE\\' where VALUE is a numerical value.', requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2011.16it/s]\n",
-            "Evaluating step(55): 0.8696 across 23 samples, Max potential: 0.94:  46%|████▌     | 23/50 [00:15<00:17,  1.52it/s]\n",
-            "Training Step: 56:  38%|███▊      | 5/13 [02:46<04:43, 35.43s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.8333333333333334 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.66it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 646.55it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2217.45it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4572.35it/s]\n",
-            "Evaluating step(56): 0.94 across 50 samples, Max potential: 0.94: 100%|██████████| 50/50 [00:00<00:00, 390.77it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.94 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 57:  46%|████▌     | 6/13 [02:54<03:02, 26.03s/it]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 145.48it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.52it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 375.76it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1437.76it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 0.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check for any grouped items and count them correctly. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1614.47it/s]\n",
-            "Evaluating step(57): 0.7857 across 14 samples, Max potential: 0.94:  28%|██▊       | 14/50 [00:19<00:50,  1.41s/it]\n",
-            "Training Step: 58:  54%|█████▍    | 7/13 [03:23<02:42, 27.04s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.7333333333333333 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 137.96it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.94it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 806.79it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11522.81it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be formatted as: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3560.17it/s]\n",
-            "Evaluating step(58): 0.88 across 25 samples, Max potential: 0.94:  50%|█████     | 25/50 [00:17<00:17,  1.45it/s]\n",
-            "Training Step: 59:  62%|██████▏   | 8/13 [03:47<02:10, 26.06s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.8461538461538461 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.90it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.70it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 552.01it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 5648.89it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 0.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1770.11it/s]\n",
-            "Evaluating step(59): 0.9286 across 42 samples, Max potential: 0.94:  84%|████████▍ | 42/50 [00:16<00:03,  2.49it/s]\n",
-            "Training Step: 60:  69%|██████▉   | 9/13 [04:13<01:43, 26.00s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.9069767441860465 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 314.86it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.10it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 722.53it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 7940.00it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count for precision. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 7188.43it/s]\n",
-            "Evaluating step(60): 0.8966 across 29 samples, Max potential: 0.94:  58%|█████▊    | 29/50 [00:15<00:11,  1.84it/s]\n",
-            "Training Step: 61:  77%|███████▋  | 10/13 [04:35<01:14, 24.87s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.8666666666666667 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 95.68it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.74it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 587.05it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12520.31it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3036.62it/s]\n",
-            "Evaluating step(61): 0.9286 across 42 samples, Max potential: 0.94:  84%|████████▍ | 42/50 [00:00<00:00, 327.89it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.9069767441860465 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 62:  85%|████████▍ | 11/13 [04:44<00:40, 20.14s/it]\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 136.40it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  3.17it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 417.11it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14339.50it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Loss backward...\n",
-            "setting pred name Generator_outputy_pred_2 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_0 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_3 score to 1.0\n",
-            "setting pred name Generator_outputy_pred_1 score to 1.0\n",
-            "Optimizer propose...\n",
-            "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 5035.06it/s]\n",
-            "Evaluating step(62): 0.9286 across 42 samples, Max potential: 0.94:  84%|████████▍ | 42/50 [00:00<00:00, 327.19it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Optimizer revert: 0.9069767441860465 <= 0.94\n",
-            "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 62:  92%|█████████▏| 12/13 [04:51<00:24, 24.28s/it]\n",
-            "Epoch:   0%|          | 0/1 [04:51<?, ?it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Reached max steps\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    {{system_prompt}}\n",
-            "    {# Few shot demos #}\n",
-            "    {% if few_shot_demos is not none %}\n",
-            "    Here are some examples:\n",
-            "    {{few_shot_demos}}\n",
-            "    {% endif %}\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <START_OF_USER>\n",
-            "    {{input_str}}\n",
-            "    <END_OF_USER>\n",
-            "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
-            ")\n",
-            "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
-            "Configuring teacher generator for Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            ")\n",
-            "Teacher generator set: Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            "), teacher Generator(\n",
-            "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
-            "  (prompt): Prompt(\n",
-            "    template: <START_OF_SYSTEM_PROMPT>\n",
-            "    You are the feedback engine in an optimization system.\n",
-            "    \n",
-            "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
-            "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
-            "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
-            "    3. Consider the variable in the context of its peers if provided.\n",
-            "    Remember:\n",
-            "    Be concise, critical, and direct.\n",
-            "    <END_OF_SYSTEM_PROMPT>\n",
-            "    <CONVERSATION>\n",
-            "    {{conversation_sec}}\n",
-            "    </CONVERSATION>\n",
-            "    {{objective_instruction_sec}}\n",
-            "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
-            "  )\n",
-            "  (model_client): OpenAIClient()\n",
-            ")\n",
-            "Teacher generator configured.\n",
-            "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
-            "Starting step: 62\n",
-            "trainer_results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 63:   0%|          | 0/12 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 63\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 175.38it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id fd34672a-ffd1-498e-a88f-283aa9d4f65d already exists. Updating the trace.\n",
-            "Trace with id 82cf82ff-d826-4bb1-847c-9938aeec8ff5 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 46a8994f-fce6-4031-b251-1c8af31d88d2 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00,  4.32it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 445.92it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9063.87it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 132.51it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.85it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 913.05it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1900.02it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2838.94it/s]\n",
-            "Evaluating step(63): 0.5 across 6 samples, Max potential: 0.94:  12%|█▏        | 6/50 [00:31<03:48,  5.20s/it]\n",
-            "Training Step: 64:  17%|█▋        | 2/12 [00:36<06:01, 36.20s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.42857142857142855 <= 0.94, revert\n",
-            "Training Step: 64\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 173.87it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 60866bed-8020-4610-a39a-a4a730c035db already exists. Updating the trace.\n",
-            "Trace with id 7694df14-3a24-40bd-a3fa-036c2645eca3 already exists. Updating the trace."
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 3835ee47-6951-49ec-b285-621fc1085024 already exists. Updating the trace.\n",
-            "\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:00<00:00,  4.64it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d3f33ded-170a-4b87-9b0b-987d5fb7b817 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 1138.44it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 3232.60it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 151.65it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 725.72it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10845.00it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2971.02it/s]\n",
-            "Evaluating step(64): 0.5 across 6 samples, Max potential: 0.94:  12%|█▏        | 6/50 [00:00<00:00, 136.83it/s]\n",
-            "Training Step: 65:  33%|███▎      | 4/12 [00:41<01:29, 11.21s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.42857142857142855 <= 0.94, revert\n",
-            "Training Step: 65\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 201.47it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 99607986-e107-46b8-b86b-177b295983c4 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id ffe67a7b-7b81-4302-b6ed-4b506570274b already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  50%|█████     | 2/4 [00:00<00:00,  2.54it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 71d549d2-9cc8-46ba-a7f6-d07f69263fd3 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.89it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 4cd9f4ec-2648-4e85-8e17-3dae1b8558d3 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 402.70it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 6304.85it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 218.71it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.19it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 858.52it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 768.93it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1635.33it/s]\n",
-            "Evaluating step(65): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:23<00:00,  2.16it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Pass validation: 0.96 > 0.94\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 3294.35it/s]\n",
-            "Evaluating step(65): 0.95 across 100 samples, Max potential: 0.95: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]\n",
-            "Training Step: 66:  42%|████▏     | 5/12 [01:50<02:42, 23.20s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 66\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 186.04it/s]\n",
-            "Training:  25%|██▌       | 1/4 [00:00<00:02,  1.01it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id fe9b883c-4f47-44f7-a388-b03a2fb10413 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  50%|█████     | 2/4 [00:01<00:01,  1.30it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 12a6ff3d-f54d-4d89-b5f0-1aec30e96398 already exists. Updating the trace.\n",
-            "Trace with id 840d9ed5-8222-45a9-a406-7445feae9733 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.46it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 3a9a47c8-a210-43a4-8d24-b9159babb6e4 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 636.54it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9420.11it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 111.34it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.50it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 321.28it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 731.61it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1120.89it/s]\n",
-            "Evaluating step(66): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 212.00it/s]\n",
-            "Training Step: 67:  58%|█████▊    | 7/12 [01:55<01:32, 18.51s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.96 <= 0.96, revert\n",
-            "Training Step: 67\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data:   0%|          | 0/4 [00:00<?, ?it/s]\u001b[A\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 31.60it/s]\n",
-            "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 5124e2e6-2aac-4dd3-ab63-9277a7b806a7 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  25%|██▌       | 1/4 [00:01<00:05,  1.78s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id ac43f3d4-d67d-4912-95d6-0baa09b52d9a already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  75%|███████▌  | 3/4 [00:02<00:00,  1.63it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d71ad721-d21d-42f1-af9b-719ff026406b already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id c42fea48-1b90-4388-92c4-b65b4356a3a2 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 420.84it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 533.39it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 48.64it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.64it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 396.85it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 8608.11it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4202.88it/s]\n",
-            "Evaluating step(67): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 405.51it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.96 <= 0.96, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 68:  67%|██████▋   | 8/12 [02:02<00:47, 11.99s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 68\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 77.30it/s]\n",
-            "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id d4194dd1-739a-4509-8ac8-7c3f89649ee7 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training:  75%|███████▌  | 3/4 [00:01<00:00,  2.62it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 1eb770ed-ff6f-481e-8c16-b9749a44a1a6 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.46it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 54e272c5-1360-462e-b773-4c58c61472ee already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 212.56it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10831.00it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 179.03it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.09it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 502.04it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 639.84it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3305.62it/s]\n",
-            "Evaluating step(68): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 539.54it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.96 <= 0.96, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 69:  75%|███████▌  | 9/12 [02:09<00:32, 10.69s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 69\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 84.70it/s]\n",
-            "Training:  25%|██▌       | 1/4 [00:01<00:03,  1.26s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 0e8910c8-703d-4766-a483-c5691125fd03 already exists. Updating the trace.\n",
-            "Trace with id 74d1bc97-46cd-406d-8c3a-2f999aae1b2f already exists. Updating the trace.\n",
-            "Trace with id 701be0ee-29e0-42f5-be04-72d2b73e3968 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.67it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id de4e75d6-a21b-4004-925d-a9a818bd0f7c already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 331.49it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 488.36it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 274.35it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.51it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 596.31it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14678.23it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4604.98it/s]\n",
-            "Evaluating step(69): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:00<00:00, 88.47it/s]\n",
-            "Training Step: 70:  92%|█████████▏| 11/12 [02:13<00:08,  8.97s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.4 <= 0.96, revert\n",
-            "Training Step: 70\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 169.70it/s]\n",
-            "Training:  25%|██▌       | 1/4 [00:01<00:03,  1.03s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id e2bfbbe0-fb79-4df5-9a7d-50c9085947bc already exists. Updating the trace.\n",
-            "Trace with id 6c34d6e5-0e3d-4243-834e-fd6c5883f467 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.45it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 1b4b3ab0-d20f-4fc2-a09c-4592a227a8e5 already exists. Updating the trace.\n",
-            "Trace with id aefd17e5-9682-4420-a820-c484a63d6dcd already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 285.47it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 288.20it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 262.75it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.60it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 293.12it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1091.27it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4087.46it/s]\n",
-            "Evaluating step(70): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 345.89it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.96 <= 0.96, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 71: 100%|██████████| 12/12 [02:17<00:00,  6.07s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 71\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 87.52it/s]\n",
-            "Training:  25%|██▌       | 1/4 [00:01<00:03,  1.33s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 6c0d3a9a-bb01-4fb3-a68b-1edf66861235 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  50%|█████     | 2/4 [00:01<00:01,  1.37it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 234e39df-1bc4-41df-a515-895cb2614a53 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  75%|███████▌  | 3/4 [00:01<00:00,  1.92it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 8895d6bd-eab0-48af-ad4b-51f8007258b1 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.87it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 4dad0f65-d624-48c2-a795-596c00b0535a already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 262.50it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10369.11it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 43.64it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 321.11it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1141.46it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4407.91it/s]\n",
-            "Evaluating step(71): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 397.66it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.96 <= 0.96, revert\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Training Step: 72: : 13it [02:23,  6.04s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Training Step: 72\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 113.31it/s]\n",
-            "Training:  25%|██▌       | 1/4 [00:00<00:02,  1.04it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 85d63f78-39c0-4753-a9fc-52202df48673 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  50%|█████     | 2/4 [00:01<00:01,  1.82it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 433650a5-ca75-4867-b235-3af4a7c55c67 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\rTraining:  75%|███████▌  | 3/4 [00:01<00:00,  2.48it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id e250f80e-334e-4f85-ac1f-df9a2013d578 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.86it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 1d3eceeb-ad24-40f6-8752-2f38241172cb already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 170.72it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 13981.01it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 195.45it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:01<00:00,  2.46it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 241.42it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 322.68it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3970.90it/s]\n",
-            "Evaluating step(72): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:00<00:00, 96.75it/s] \n",
-            "Training Step: 73: : 14it [02:30,  6.33s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.4 <= 0.96, revert\n",
-            "Training Step: 73\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 73.23it/s]\n",
-            "Training:  25%|██▌       | 1/4 [00:01<00:05,  1.97s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id daa5804f-1aad-4f01-b26c-6b31c57f065f already exists. Updating the trace.\n",
-            "Trace with id dd9d8748-4926-4bcd-902d-6a4c5cb38267 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.80it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 04e77795-cc9b-4530-a883-5f775e3fbc76 already exists. Updating the trace.\n",
-            "Trace with id 1f682cab-026c-4803-8018-a45d027aa026 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 211.00it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1551.58it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 205.19it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.80it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 266.20it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1059.57it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2226.42it/s]\n",
-            "Evaluating step(73): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:00<00:00, 91.13it/s]\n",
-            "Training Step: 74: : 16it [02:35,  6.09s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.4 <= 0.96, revert\n",
-            "Training Step: 74\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.65it/s]\n",
-            "Training:  25%|██▌       | 1/4 [00:01<00:04,  1.36s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id 945f82c7-03d9-4f49-8267-be7abac2bce6 already exists. Updating the trace.\n",
-            "Trace with id a9a202f5-e723-4d24-ae5e-ad1084a52ef8 already exists. Updating the trace.\n",
-            "Trace with id d46e538c-832d-4eb5-ba9b-a308f666baba already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.59it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Trace with id b538075d-01af-4b76-b835-9005f3044609 already exists. Updating the trace.\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 115.74it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1086.11it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 225.28it/s]\n",
-            "Training: 100%|██████████| 4/4 [00:02<00:00,  1.50it/s]\n",
-            "\n",
-            "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 224.88it/s]\n",
-            "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14242.12it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "sampled_augmented_demos: ['b538075d-01af-4b76-b835-9005f3044609']\n",
-            "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a blackberry, a raspberry, a peach, a head of broccoli, a plum,\\n  an orange, two bananas, a grape, two garlics, a nectarine, a lettuce head, and an\\n  apple. How many fruits do I have?\\nExample: 'Let''s list each item and identify whether it is a fruit:\\n\\n\\n  1. Blackberry - Fruit\\n\\n  2. Raspberry - Fruit\\n\\n  3. Peach - Fruit\\n\\n  4. Head of broccoli - Not a fruit\\n\\n  5. Plum - Fruit\\n\\n  6. Orange - Fruit\\n\\n  7. Two bananas - Fruits (2 bananas)\\n\\n  8. Grape - Fruit\\n\\n  9. Two garlics - Not fruits\\n\\n  10. Nectarine - Fruit\\n\\n  11. Lettuce head - Not a fruit\\n\\n  12. Apple - Fruit\\n\\n\\n  Now, let''s count the fruits:\\n\\n\\n  1. Blackberry\\n\\n  2. Raspberry\\n\\n  3. Peach\\n\\n  4. Plum\\n\\n  5. Orange\\n\\n  6. Two bananas (counted as 2)\\n\\n  7. Grape\\n\\n  8. Nectarine\\n\\n  9. Apple\\n\\n\\n  Total number of fruits:\\n\\n  1 + 1 + 1 + 1 + 1 + 2 + 1 + 1 + 1 = 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 634.93it/s]\n",
-            "Evaluating step(74): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:28<05:25,  7.07s/it]\n",
-            "Training Step: 74: 100%|██████████| 12/12 [03:12<00:00, 16.04s/it]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fail validation: 0.4 <= 0.96, revert\n",
-            "Saved ckpt to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
-            "Training time: 484.17421078681946s\n",
-            "ckpt_file: /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n"
-          ]
-        }
-      ],
-      "source": [
-        "\n",
-        "train(debug=False, max_steps=12, strategy=\"random\",\n",
-        "                raw_shots=0, bootstrap_shots=1,\n",
-        "                resume_from_ckpt=ckpt_path,\n",
-        "                exclude_input_fields_from_bootstrap_demos=False)"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 445.28it/s]\n",
+      "Evaluating step(4): 1.0 across 4 samples, Max potential: 1.0: 100%|██████████| 4/4 [00:01<00:00,  2.67it/s]\n",
+      "Proposing:   0%|          | 0/5 [00:03<?, ?it/s]\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xTB4lO3PFPnP"
-      },
-      "source": [
-        "Finally, we got 96% on the val and 95% on the test!!! This is really close to GPT4o's performance. This took us 72 steps!\n",
-        "\n",
-        "The score is consistent, meaning this is a good prompt.\n",
-        "Here is our final optimized prompt:\n",
-        "\n",
-        "System:\n",
-        "\n",
-        "```\n",
-        "\n",
-        "\"prompt\": [\n",
-        "                {\n",
-        "                    \"id\": \"327b63f0-b532-435a-85d7-6137d4e52c4c\",\n",
-        "                    \"name\": \"llm_counter.system_prompt\",\n",
-        "                    \"data\": \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\",\n",
-        "                    \"requires_opt\": true\n",
-        "                },\n",
-        "                {\n",
-        "                    \"id\": \"73a3953b-6351-44d8-a36f-7521db346cca\",\n",
-        "                    \"name\": \"llm_counter.few_shot_demos\",\n",
-        "                    \"data\": \"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\",\n",
-        "                    \"requires_opt\": true\n",
-        "                }\n",
-        "            ]\n",
-        "```\n",
-        "\n",
-        "\n",
-        "You will see all steps record from the log."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 1.0 >= 0.75\n",
+      "Done with proposals\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Fr0V3XNCHAis"
-      },
-      "source": [
-        "Happy Optimizing!!!"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1139.66it/s]\n",
+      "Evaluating step(5): 0.84 across 50 samples, Max potential: 0.84: 100%|██████████| 50/50 [00:16<00:00,  3.04it/s]\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "3Wnvqs3RyI_z"
-      },
-      "source": [
-        "# Issues and feedback\n",
-        "\n",
-        "If you encounter any issues, please report them here: [GitHub Issues](https://github.com/SylphAI-Inc/LightRAG/issues).\n",
-        "\n",
-        "For feedback, you can use either the [GitHub discussions](https://github.com/SylphAI-Inc/LightRAG/discussions) or [Discord](https://discord.gg/ezzszrRZvT)."
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "provenance": []
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer step: 0.84 > 0.8\n"
+     ]
     },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 1658.72it/s]\n",
+      "Evaluating step(4): 0.91 across 100 samples, Max potential: 0.91: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]\n",
+      "Training Step: 6:  38%|███▊      | 5/13 [00:56<02:18, 17.27s/it]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 207.97it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.86it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 494.99it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 805.09it/s]\n"
+     ]
     },
-    "language_info": {
-      "name": "python"
-    }
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.75\n",
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "Subset loss backward time: 4.081957817077637\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 538.35it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00,  3.13it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:08,  2.13s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 151.18it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 204.61it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.66s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 698.62it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 571.41it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:05<00:03,  1.61s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 116.83it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.50it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:07<00:01,  1.88s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 399.65it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 571.09it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:08<00:00,  1.69s/it]\n",
+      "Training Step: 7:  46%|████▌     | 6/13 [01:09<01:53, 16.18s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 59.06it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 410.78it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 4694.24it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 7\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.875\n",
+      "Moving batch correct size: 7\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "Subset loss backward time: 3.0843119621276855\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 154.50it/s]\n",
+      "Evaluating step(6): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  1.52it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 1.0 > 0.6666666666666666\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 8/8 [00:00<00:00, 279.47it/s]\n",
+      "Evaluating step(6): 0.875 across 8 samples, Max potential: 0.875: 100%|██████████| 8/8 [00:01<00:00,  4.43it/s]\n",
+      "Proposing:   0%|          | 0/5 [00:04<?, ?it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 0.875 >= 0.875\n",
+      "Done with proposals\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2336.58it/s]\n",
+      "Evaluating step(7): 0.84 across 50 samples, Max potential: 0.84: 100%|██████████| 50/50 [00:17<00:00,  2.88it/s]\n",
+      "Training Step: 8:  54%|█████▍    | 7/13 [01:37<01:58, 19.81s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.84 <= 0.84\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 148.75it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.04it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 345.11it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 7550.50it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 11\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.9166666666666666\n",
+      "Moving batch correct size: 11\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "Subset loss backward time: 2.337067127227783\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 193.84it/s]\n",
+      "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.16it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:09,  2.39s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified in words. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 147.89it/s]\n",
+      "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.04it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:04<00:07,  2.41s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 423.61it/s]\n",
+      "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 556.86it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:05<00:03,  1.78s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 532.41it/s]\n",
+      "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 522.78it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:06<00:01,  1.44s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item individually, especially when quantities are specified in words. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 284.18it/s]\n",
+      "Evaluating step(7): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 160.35it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.59s/it]\n",
+      "Training Step: 9:  62%|██████▏   | 8/13 [01:50<01:27, 17.55s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 87.73it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.62it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 342.85it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 7157.52it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 14\n",
+      "Moving batch error size: 2\n",
+      "Moving batch acc: 0.875\n",
+      "Moving batch correct size: 14\n",
+      "Moving batch error size: 2\n",
+      "Subset Error size: 2\n",
+      "Subset Correct size: 4\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_0 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "Subset loss backward time: 7.823317050933838\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 166.50it/s]\n",
+      "Evaluating step(8): 0.8333 across 6 samples, Max potential: 0.8333: 100%|██████████| 6/6 [00:02<00:00,  2.78it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 0.8333333333333334 > 0.6666666666666666\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 16/16 [00:00<00:00, 481.75it/s]\n",
+      "Evaluating step(8): 0.875 across 16 samples, Max potential: 0.875: 100%|██████████| 16/16 [00:03<00:00,  5.21it/s]\n",
+      "Proposing:   0%|          | 0/5 [00:06<?, ?it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 0.875 >= 0.875\n",
+      "Done with proposals\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1112.82it/s]\n",
+      "Evaluating step(9): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:16<00:00,  2.97it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer step: 0.86 > 0.84\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 2395.58it/s]\n",
+      "Evaluating step(8): 0.87 across 100 samples, Max potential: 0.87: 100%|██████████| 100/100 [00:30<00:00,  3.30it/s]\n",
+      "Training Step: 10:  69%|██████▉   | 9/13 [02:52<02:04, 31.23s/it]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 212.83it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.04it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 655.18it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1241.84it/s]\n",
+      "Training Step: 11:  77%|███████▋  | 10/13 [02:55<01:07, 22.43s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping batch 9 as acc: 1.0\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.95it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.23it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 757.71it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1320.62it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.75\n",
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "Subset loss backward time: 3.768970012664795\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 125.10it/s]\n",
+      "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.77it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:08,  2.19s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 571.28it/s]\n",
+      "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 429.07it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.58s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to categories and quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 111.64it/s]\n",
+      "Evaluating step(10): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  2.63it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 1.0 > 0.6666666666666666\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 455.77it/s]\n",
+      "Evaluating step(10): 1.0 across 4 samples, Max potential: 1.0: 100%|██████████| 4/4 [00:00<00:00,  5.14it/s]\n",
+      "Proposing:  40%|████      | 2/5 [00:06<00:09,  3.17s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 1.0 >= 0.75\n",
+      "Done with proposals\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1732.93it/s]\n",
+      "Evaluating step(11): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:18<00:04,  2.21it/s]\n",
+      "Training Step: 12:  85%|████████▍ | 11/13 [03:24<00:49, 24.61s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.8048780487804879 <= 0.86\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 128.86it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.24it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 470.20it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2608.40it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 6\n",
+      "Moving batch error size: 2\n",
+      "Moving batch acc: 0.75\n",
+      "Moving batch correct size: 6\n",
+      "Moving batch error size: 2\n",
+      "Subset Error size: 2\n",
+      "Subset Correct size: 4\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "Subset loss backward time: 6.722561836242676\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 265.78it/s]\n",
+      "Evaluating step(11): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:01<00:00,  3.58it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:10,  2.65s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 396.33it/s]\n",
+      "Evaluating step(11): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 354.51it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:03<00:05,  1.80s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, ensuring you categorize them correctly. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 789.39it/s]\n",
+      "Evaluating step(11): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 233.79it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.49s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each relevant item, excluding any that do not fit the category. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 181.12it/s]\n",
+      "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:02<00:00,  2.13it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:08<00:02,  2.44s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each relevant item, excluding any that do not fit the category. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 807.04it/s]\n",
+      "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 275.78it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:10<00:00,  2.01s/it]\n",
+      "Training Step: 12:  92%|█████████▏| 12/13 [03:43<00:18, 18.61s/it]\n",
+      "Epoch: 100%|██████████| 1/1 [03:43<00:00, 223.37s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n",
+      "Reached max steps\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      ")\n",
+      "Teacher generator configured.\n",
+      "save to /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
+      "Starting step: 12\n",
+      "trainer_results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 13:   0%|          | 0/12 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 13\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 158.10it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 6c34d6e5-0e3d-4243-834e-fd6c5883f467 already exists. Updating the trace.Trace with id 234e39df-1bc4-41df-a515-895cb2614a53 already exists. Updating the trace.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.35it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 8895d6bd-eab0-48af-ad4b-51f8007258b1 already exists. Updating the trace.\n",
+      "Trace with id c42fea48-1b90-4388-92c4-b65b4356a3a2 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 490.46it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1656.19it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 247.40it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.77it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 365.97it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9294.86it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['c42fea48-1b90-4388-92c4-b65b4356a3a2']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Trombone: 1\\n\\n  2. Violin: 1\\n\\n  3. Clarinet: 1\\n\\n  4. Accordion: 1\\n\\n  5. Flutes: 4\\n\\n  6. Trumpet: 1\\n\\n  7. Drums: 2\\n\\n  8. Piano: 1\\n\\n\\n  Now, let''s add them up:\\n\\n\\n  1 + 1 + 1 + 1 + 4 + 1 + 2 + 1 = 12\\n\\n\\n  Answer: 12'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2578.13it/s]\n",
+      "Evaluating step(13): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:23<00:13,  1.35it/s]\n",
+      "Training Step: 14:  17%|█▋        | 2/12 [00:27<04:35, 27.54s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
+      "Training Step: 14\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 136.94it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id fd34672a-ffd1-498e-a88f-283aa9d4f65d already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.92it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 46a8994f-fce6-4031-b251-1c8af31d88d2 already exists. Updating the trace.\n",
+      "Trace with id 2bc992c0-9832-47f1-87c3-9f6e4b18ee99 already exists. Updating the trace.\n",
+      "Trace with id 12a6ff3d-f54d-4d89-b5f0-1aec30e96398 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 443.10it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 3302.60it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 114.14it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.59it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 685.93it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 5111.89it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1784.60it/s]\n",
+      "Evaluating step(14): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:19<00:11,  1.61it/s]\n",
+      "Training Step: 15:  33%|███▎      | 4/12 [00:52<02:10, 16.36s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
+      "Training Step: 15\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 164.67it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 660c5004-35d2-4a6d-9a06-1e0b3f032f21 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training:  25%|██▌       | 1/4 [00:00<00:02,  1.12it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d3f33ded-170a-4b87-9b0b-987d5fb7b817 already exists. Updating the trace.\n",
+      "Trace with id de4e75d6-a21b-4004-925d-a9a818bd0f7c already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.02it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 1f682cab-026c-4803-8018-a45d027aa026 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 665.05it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1875.18it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 160.86it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.14it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 621.42it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9054.08it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2982.93it/s]\n",
+      "Evaluating step(15): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 495.97it/s]\n",
+      "Training Step: 16:  42%|████▏     | 5/12 [00:56<01:03,  9.03s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
+      "Training Step: 16\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 127.68it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 0e8910c8-703d-4766-a483-c5691125fd03 already exists. Updating the trace.Trace with id ffe67a7b-7b81-4302-b6ed-4b506570274b already exists. Updating the trace.\n",
+      "Trace with id e250f80e-334e-4f85-ac1f-df9a2013d578 already exists. Updating the trace.\n",
+      "Trace with id fe9b883c-4f47-44f7-a388-b03a2fb10413 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 534.68it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 201.71it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10453.09it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 195.85it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.52it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 560.49it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1250.72it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3902.04it/s]\n",
+      "Evaluating step(16): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 398.91it/s]\n",
+      "Training Step: 17:  58%|█████▊    | 7/12 [00:58<00:35,  7.16s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
+      "Training Step: 17\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data:   0%|          | 0/4 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d46e538c-832d-4eb5-ba9b-a308f666baba already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rLoading Data: 100%|██████████| 4/4 [00:00<00:00, 106.99it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id a9a202f5-e723-4d24-ae5e-ad1084a52ef8 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training:  75%|███████▌  | 3/4 [00:00<00:00,  3.09it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 74d1bc97-46cd-406d-8c3a-2f999aae1b2f already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.92it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 334.77it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 874.86it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 370.55it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.81it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 482.84it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 645.40it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2888.08it/s]\n",
+      "Evaluating step(17): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 221.76it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 18:  67%|██████▋   | 8/12 [01:02<00:19,  4.87s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 18\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 111.28it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d4194dd1-739a-4509-8ac8-7c3f89649ee7 already exists. Updating the trace.Trace with id 1eb770ed-ff6f-481e-8c16-b9749a44a1a6 already exists. Updating the trace.\n",
+      "Trace with id 7694df14-3a24-40bd-a3fa-036c2645eca3 already exists. Updating the trace.\n",
+      "\n",
+      "Trace with id 4cd9f4ec-2648-4e85-8e17-3dae1b8558d3 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 585.96it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 225.18it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1038.07it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 250.95it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.18it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 438.82it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2456.40it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2422.27it/s]\n",
+      "Evaluating step(18): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 456.47it/s]\n",
+      "Training Step: 19:  75%|███████▌  | 9/12 [01:05<00:13,  4.41s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
+      "Training Step: 19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 57.52it/s]\n",
+      "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 5124e2e6-2aac-4dd3-ab63-9277a7b806a7 already exists. Updating the trace.\n",
+      "Trace with id 1d3eceeb-ad24-40f6-8752-2f38241172cb already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.16it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 3a9a47c8-a210-43a4-8d24-b9159babb6e4 already exists. Updating the trace.Trace with id 6c0d3a9a-bb01-4fb3-a68b-1edf66861235 already exists. Updating the trace.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 193.38it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 6143.25it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 107.12it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.42it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 375.70it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10505.46it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3644.75it/s]\n",
+      "Evaluating step(19): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 275.17it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 20:  92%|█████████▏| 11/12 [01:09<00:04,  4.32s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 20\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 125.16it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id b538075d-01af-4b76-b835-9005f3044609 already exists. Updating the trace.\n",
+      "Trace with id dd9d8748-4926-4bcd-902d-6a4c5cb38267 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 60866bed-8020-4610-a39a-a4a730c035db already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00,  4.20it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 85d63f78-39c0-4753-a9fc-52202df48673 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 328.35it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 999.36it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 239.24it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.87it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 353.26it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 391.07it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['60866bed-8020-4610-a39a-a4a730c035db']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Bed: 1\\n\\n  2. Fridge: 1\\n\\n  3. Lamp: 1\\n\\n  4. Toaster: 1\\n\\n  5. Chairs: 4\\n\\n  6. Table: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 1 + 1 + 4 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1763.23it/s]\n",
+      "Evaluating step(20): 0.7083 across 24 samples, Max potential: 0.86:  48%|████▊     | 24/50 [00:17<00:18,  1.38it/s]\n",
+      "Training Step: 21: 100%|██████████| 12/12 [01:34<00:00,  7.82s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.68 <= 0.86, revert\n",
+      "Training Step: 21\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 208.10it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id aefd17e5-9682-4420-a820-c484a63d6dcd already exists. Updating the trace.\n",
+      "Trace with id 04e77795-cc9b-4530-a883-5f775e3fbc76 already exists. Updating the trace.\n",
+      "Trace with id 701be0ee-29e0-42f5-be04-72d2b73e3968 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00,  4.56it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 433650a5-ca75-4867-b235-3af4a7c55c67 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 187.26it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2595.49it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 129.91it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.86it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 172.30it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 689.23it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3183.48it/s]\n",
+      "Evaluating step(21): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 438.75it/s]\n",
+      "Training Step: 22: : 13it [01:38,  6.76s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
+      "Training Step: 22\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data:   0%|          | 0/4 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 4dad0f65-d624-48c2-a795-596c00b0535a already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 86.81it/s]\n",
+      "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 1b4b3ab0-d20f-4fc2-a09c-4592a227a8e5 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.23it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 82cf82ff-d826-4bb1-847c-9938aeec8ff5 already exists. Updating the trace.\n",
+      "Trace with id ac43f3d4-d67d-4912-95d6-0baa09b52d9a already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 143.58it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 842.95it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 133.83it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.36it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 326.14it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 307.38it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['60866bed-8020-4610-a39a-a4a730c035db']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Bed: 1\\n\\n  2. Fridge: 1\\n\\n  3. Lamp: 1\\n\\n  4. Toaster: 1\\n\\n  5. Chairs: 4\\n\\n  6. Table: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 1 + 1 + 4 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 5440.79it/s]\n",
+      "Evaluating step(22): 0.7083 across 24 samples, Max potential: 0.86:  48%|████▊     | 24/50 [00:00<00:00, 303.26it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.68 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 23: : 14it [01:42,  6.13s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 23\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 91.93it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id daa5804f-1aad-4f01-b26c-6b31c57f065f already exists. Updating the trace.\n",
+      "Trace with id e2bfbbe0-fb79-4df5-9a7d-50c9085947bc already exists. Updating the trace.\n",
+      "Trace with id 71d549d2-9cc8-46ba-a7f6-d07f69263fd3 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.56it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 840d9ed5-8222-45a9-a406-7445feae9733 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 63.89it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 201.47it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 90.61it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.96it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 287.69it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1938.89it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3010.90it/s]\n",
+      "Evaluating step(23): 0.7812 across 32 samples, Max potential: 0.86:  64%|██████▍   | 32/50 [00:00<00:00, 423.98it/s]\n",
+      "Training Step: 24: : 16it [01:48,  6.22s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.7575757575757576 <= 0.86, revert\n",
+      "Training Step: 24\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 122.52it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 96c716a1-e984-4fe3-9ce0-e156ac709edb already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 000a3738-1f09-40b0-9f8b-2dec63a3f7f8 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.21it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d71ad721-d21d-42f1-af9b-719ff026406b already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 106.06it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1513.37it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 265.42it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.07it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 171.27it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 862.32it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['840d9ed5-8222-45a9-a406-7445feae9733']\n",
+      "New prompts: [PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item, paying special attention to quantities mentioned. Verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='2f948c14-7f8f-4f46-9e23-d30598d3f47b', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Table: 1\\n\\n  3. Fridge: 1\\n\\n  4. Stove: 1\\n\\n  5. Oven: 1\\n\\n  6. Toaster: 1\\n\\n  7. Couch: 1\\n\\n  8. Cars: 4\\n\\n\\n  Now, add them up:\\n\\n\\n  1 + 1 + 1 + 1 + 1 + 1 + 1 + 4 = 11\\n\\n\\n  Answer: 11'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1210.01it/s]\n",
+      "Evaluating step(24): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:18<00:00,  2.69it/s]\n",
+      "Training Step: 24: 100%|██████████| 12/12 [02:15<00:00, 11.26s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.86 <= 0.86, revert\n",
+      "Saved ckpt to /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
+      "Training time: 359.32386112213135s\n",
+      "ckpt_file: /root/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "train(debug=False, max_steps=12, strategy=\"constrained\",\n",
+    "      raw_shots=0, bootstrap_shots=1,\n",
+    "      exclude_input_fields_from_bootstrap_demos=True\n",
+    "      )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "KAyFhzrG_J4l"
+   },
+   "source": [
+    "Here are our scores for each step:\n",
+    "\n",
+    "\"val_scores\": [\n",
+    "        0.8,\n",
+    "        0.8,\n",
+    "        0.8,\n",
+    "        0.8,\n",
+    "        0.8,\n",
+    "        0.84,\n",
+    "        0.84,\n",
+    "        0.84,\n",
+    "        0.84,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86,\n",
+    "        0.86\n",
+    "    ]\n",
+    "\n",
+    "  \"test_scores\": [\n",
+    "        0.83,\n",
+    "        0.83,\n",
+    "        0.83,\n",
+    "        0.83,\n",
+    "        0.83,\n",
+    "        0.91,\n",
+    "        0.91,\n",
+    "        0.91,\n",
+    "        0.91,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87,\n",
+    "        0.87\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "It is normal for the validation score not to exactly match the test score. You can also train with just the test set by passing it as the validation set. To do so, modify the fit arguments as follows:\n",
+    "\n",
+    "```\n",
+    "trainer.fit(\n",
+    "        train_dataset=train_dataset,\n",
+    "        val_dataset=test_dataset,\n",
+    "        # test_dataset=test_dataset,\n",
+    "        debug=debug,\n",
+    "        resume_from_ckpt=resume_from_ckpt,\n",
+    "    )\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "953BV81y0JFv"
+   },
+   "source": [
+    "# 🔥 Resume Checkpoint\n",
+    "\n",
+    "We might want to continue from the earlier checkpoint and train for more steps.\n",
+    "\n",
+    "This is easy to do.\n",
+    "\n",
+    "**Note: Make sure to copy the checkpoint path from your own run and substitute it for the one used here, as your run might create a different file name.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "kde1V1AE7Ty0",
+    "outputId": "52d69b69-0a3a-4780-ca26-25956cc023c7"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
+      "ObjectCountAdalComponent(\n",
+      "  eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
+      "  (task): ObjectCountTaskPipeline(\n",
+      "    (llm_counter): Generator(\n",
+      "      model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "      (prompt): Prompt(\n",
+      "        template: <START_OF_SYSTEM_PROMPT>\n",
+      "        {{system_prompt}}\n",
+      "        {# Few shot demos #}\n",
+      "        {% if few_shot_demos is not none %}\n",
+      "        Here are some examples:\n",
+      "        {{few_shot_demos}}\n",
+      "        {% endif %}\n",
+      "        <END_OF_SYSTEM_PROMPT>\n",
+      "        <START_OF_USER>\n",
+      "        {{input_str}}\n",
+      "        <END_OF_USER>\n",
+      "        , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "      )\n",
+      "      (model_client): OpenAIClient()\n",
+      "      (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "    )\n",
+      "  )\n",
+      "  (loss_fn): EvalFnToTextLoss()\n",
+      ")\n",
+      "Trainer(\n",
+      "  (adaltask): ObjectCountAdalComponent(\n",
+      "    eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
+      "    (task): ObjectCountTaskPipeline(\n",
+      "      (llm_counter): Generator(\n",
+      "        model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "        (prompt): Prompt(\n",
+      "          template: <START_OF_SYSTEM_PROMPT>\n",
+      "          {{system_prompt}}\n",
+      "          {# Few shot demos #}\n",
+      "          {% if few_shot_demos is not none %}\n",
+      "          Here are some examples:\n",
+      "          {{few_shot_demos}}\n",
+      "          {% endif %}\n",
+      "          <END_OF_SYSTEM_PROMPT>\n",
+      "          <START_OF_USER>\n",
+      "          {{input_str}}\n",
+      "          <END_OF_USER>\n",
+      "          , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "        )\n",
+      "        (model_client): OpenAIClient()\n",
+      "        (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "      )\n",
+      "    )\n",
+      "    (loss_fn): EvalFnToTextLoss()\n",
+      "  )\n",
+      ")\n",
+      "raw_shots: 0, bootstrap_shots: 1\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator configured.\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Backward engine configured for all generators.\n",
+      "Restoring prompts: PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True)\n",
+      "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 27:   0%|          | 0/13 [00:00<?, ?it/s]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 417.64it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 1073.40it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 571.14it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1126.21it/s]\n",
+      "Training Step: 28:   0%|          | 0/13 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping batch 0 as acc: 1.0\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 604.56it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.83it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 540.00it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1445.81it/s]\n",
+      "Training Step: 29:  15%|█▌        | 2/13 [00:02<00:12,  1.15s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping batch 1 as acc: 1.0\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 318.87it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00,  4.06it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 458.88it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1186.26it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.75\n",
+      "Moving batch correct size: 3\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "Subset loss backward time: 4.518843650817871\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 142.52it/s]\n",
+      "Evaluating step(2): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  1.56it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:11,  2.99s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities specified in the input. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 118.95it/s]\n",
+      "Evaluating step(2): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  1.76it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:05<00:08,  2.85s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 129.26it/s]\n",
+      "Evaluating step(2): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  1.54it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 1.0 > 0.6666666666666666\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 263.51it/s]\n",
+      "Evaluating step(2): 1.0 across 4 samples, Max potential: 1.0: 100%|██████████| 4/4 [00:00<00:00,  4.20it/s]\n",
+      "Proposing:  40%|████      | 2/5 [00:10<00:15,  5.11s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 1.0 >= 0.75\n",
+      "Done with proposals\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2287.37it/s]\n",
+      "Evaluating step(29): 0.8158 across 38 samples, Max potential: 0.86:  76%|███████▌  | 38/50 [00:17<00:05,  2.17it/s]\n",
+      "Training Step: 30:  23%|██▎       | 3/13 [00:35<02:25, 14.59s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.7948717948717948 <= 0.86\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 268.93it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.69it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 603.76it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 8825.47it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 7\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.875\n",
+      "Moving batch correct size: 7\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "Subset loss backward time: 2.2182435989379883\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure you account for all items. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 160.12it/s]\n",
+      "Evaluating step(3): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  1.72it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:11,  2.83s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure each item is counted correctly. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 89.23it/s]\n",
+      "Evaluating step(3): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 1.0 > 0.6666666666666666\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 8/8 [00:00<00:00, 281.73it/s]\n",
+      "Evaluating step(3): 1.0 across 8 samples, Max potential: 1.0: 100%|██████████| 8/8 [00:02<00:00,  2.96it/s]\n",
+      "Proposing:  20%|██        | 1/5 [00:08<00:34,  8.54s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 1.0 >= 0.875\n",
+      "Done with proposals\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1910.10it/s]\n",
+      "Evaluating step(30): 0.72 across 25 samples, Max potential: 0.86:  50%|█████     | 25/50 [00:18<00:18,  1.38it/s]\n",
+      "Training Step: 31:  31%|███       | 4/13 [01:05<03:03, 20.39s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.6923076923076923 <= 0.86\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 310.31it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.75it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 454.32it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12336.19it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 11\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.9166666666666666\n",
+      "Moving batch correct size: 11\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "Subset loss backward time: 2.028568983078003\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure each item is counted correctly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 121.52it/s]\n",
+      "Evaluating step(4): 1.0 across 3 samples, Max potential: 1.0: 100%|██████████| 3/3 [00:01<00:00,  2.10it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 1.0 > 0.6666666666666666\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 12/12 [00:00<00:00, 724.90it/s]\n",
+      "Evaluating step(4): 1.0 across 12 samples, Max potential: 1.0: 100%|██████████| 12/12 [00:03<00:00,  3.66it/s]\n",
+      "Proposing:   0%|          | 0/5 [00:05<?, ?it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 1.0 >= 0.9166666666666666\n",
+      "Done with proposals\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2233.56it/s]\n",
+      "Evaluating step(31): 0.8511 across 47 samples, Max potential: 0.86:  94%|█████████▍| 47/50 [00:16<00:01,  2.81it/s]\n",
+      "Training Step: 32:  38%|███▊      | 5/13 [01:31<02:58, 22.30s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.8333333333333334 <= 0.86\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 269.31it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.20it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 606.49it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1212.58it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 15\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.9375\n",
+      "Moving batch correct size: 15\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "Subset loss backward time: 3.2150633335113525\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 130.57it/s]\n",
+      "Evaluating step(5): 0.5 across 2 samples, Max potential: 0.6667:  33%|███▎      | 1/3 [00:01<00:02,  1.39s/it]INFO:backoff:Backing off call(...) for 0.2s (openai.InternalServerError: <html>\n",
+      "<head><title>500 Internal Server Error</title></head>\n",
+      "<body>\n",
+      "<center><h1>500 Internal Server Error</h1></center>\n",
+      "<hr><center>nginx</center>\n",
+      "</body>\n",
+      "</html>)\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:50<00:00, 16.89s/it]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:52<03:28, 52.11s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 645.05it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 298.94it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:53<01:07, 22.46s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 751.40it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 360.88it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:54<00:25, 12.66s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 332.13it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 276.08it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:55<00:08,  8.12s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 440.13it/s]\n",
+      "Evaluating step(5): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 235.96it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:57<00:00, 11.41s/it]\n",
+      "Training Step: 33:  46%|████▌     | 6/13 [02:33<04:07, 35.35s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 317.05it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 676.47it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 543.36it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1518.44it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 18\n",
+      "Moving batch error size: 2\n",
+      "Moving batch acc: 0.9\n",
+      "Moving batch correct size: 18\n",
+      "Moving batch error size: 2\n",
+      "Subset Error size: 2\n",
+      "Subset Correct size: 4\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "Subset loss backward time: 7.857504606246948\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 282.66it/s]\n",
+      "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:02<00:00,  2.75it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:03<00:13,  3.26s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 687.22it/s]\n",
+      "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 539.26it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:04<00:06,  2.16s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 608.62it/s]\n",
+      "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 246.48it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:05<00:03,  1.68s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 417.60it/s]\n",
+      "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 422.96it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:07<00:01,  1.58s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 464.91it/s]\n",
+      "Evaluating step(6): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 269.93it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:08<00:00,  1.67s/it]\n",
+      "Training Step: 34:  54%|█████▍    | 7/13 [02:49<02:55, 29.23s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 104.68it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.42it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 556.85it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14230.04it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 22\n",
+      "Moving batch error size: 2\n",
+      "Moving batch acc: 0.9\n",
+      "Moving batch correct size: 18\n",
+      "Moving batch error size: 2\n",
+      "Subset Error size: 2\n",
+      "Subset Correct size: 4\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "Subset loss backward time: 6.2225048542022705\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 584.16it/s]\n",
+      "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:01<00:00,  4.41it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:10,  2.54s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 943.25it/s]\n",
+      "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 367.37it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.65s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 802.76it/s]\n",
+      "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 290.57it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.44s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 736.81it/s]\n",
+      "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 352.92it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:05<00:01,  1.31s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 596.84it/s]\n",
+      "Evaluating step(7): 0.5 across 6 samples, Max potential: 0.5: 100%|██████████| 6/6 [00:00<00:00, 250.75it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.45s/it]\n",
+      "Training Step: 35:  62%|██████▏   | 8/13 [03:04<02:04, 24.82s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.5 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 70.79it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.78it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 388.55it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2027.46it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 22\n",
+      "Moving batch error size: 2\n",
+      "Moving batch acc: 0.9\n",
+      "Moving batch correct size: 18\n",
+      "Moving batch error size: 2\n",
+      "Subset Error size: 2\n",
+      "Subset Correct size: 4\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "Subset loss backward time: 5.618266582489014\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 304.00it/s]\n",
+      "Evaluating step(8): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:02<00:00,  2.79it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:03<00:13,  3.44s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly and ensure each item is counted correctly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 128.97it/s]\n",
+      "Evaluating step(8): 1.0 across 6 samples, Max potential: 1.0: 100%|██████████| 6/6 [00:01<00:00,  3.62it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass subset check: 1.0 > 0.6666666666666666\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 20/20 [00:00<00:00, 649.93it/s]\n",
+      "Evaluating step(8): 0.95 across 20 samples, Max potential: 0.95: 100%|██████████| 20/20 [00:02<00:00,  8.93it/s]\n",
+      "Proposing:  20%|██        | 1/5 [00:08<00:35,  8.79s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass full check: 0.95 >= 0.9\n",
+      "Done with proposals\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2667.62it/s]\n",
+      "Evaluating step(35): 0.8511 across 47 samples, Max potential: 0.86:  94%|█████████▍| 47/50 [00:00<00:00, 559.52it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.8333333333333334 <= 0.86\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 36:  69%|██████▉   | 9/13 [03:21<01:29, 22.39s/it]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 154.85it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.33it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 610.06it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1798.78it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 22\n",
+      "Moving batch error size: 2\n",
+      "Moving batch acc: 0.95\n",
+      "Moving batch correct size: 19\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "Subset loss backward time: 2.553833246231079\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 228.47it/s]\n",
+      "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:01<00:00,  2.44it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:09,  2.47s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 700.57it/s]\n",
+      "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 207.56it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:03<00:05,  1.69s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 782.91it/s]\n",
+      "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 712.51it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.49s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 269.05it/s]\n",
+      "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 266.32it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:06<00:01,  1.40s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 466.64it/s]\n",
+      "Evaluating step(9): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 498.14it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.48s/it]\n",
+      "Training Step: 37:  77%|███████▋  | 10/13 [03:33<00:56, 18.97s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 115.54it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.77it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 561.81it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1002.40it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 23\n",
+      "Moving batch error size: 1\n",
+      "Moving batch acc: 0.95\n",
+      "Moving batch correct size: 19\n",
+      "Moving batch error size: 1\n",
+      "Subset Error size: 1\n",
+      "Subset Correct size: 2\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "Subset loss backward time: 2.35148024559021\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 139.22it/s]\n",
+      "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00,  3.95it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:01<00:07,  1.81s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 277.60it/s]\n",
+      "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 561.39it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:02<00:04,  1.42s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 736.01it/s]\n",
+      "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 168.63it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:03<00:02,  1.24s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 441.77it/s]\n",
+      "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 518.09it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:05<00:01,  1.19s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 396.70it/s]\n",
+      "Evaluating step(10): 0.6667 across 3 samples, Max potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 199.84it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:06<00:00,  1.27s/it]\n",
+      "Training Step: 38:  85%|████████▍ | 11/13 [03:43<00:32, 16.20s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 138.49it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00,  6.41it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 610.01it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10665.74it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Moving batch correct size: 22\n",
+      "Moving batch error size: 2\n",
+      "Moving batch acc: 0.9\n",
+      "Moving batch correct size: 18\n",
+      "Moving batch error size: 2\n",
+      "Subset Error size: 2\n",
+      "Subset Correct size: 4\n",
+      "Subset score: 0.6666666666666666\n",
+      "Subset batch acc: 0.6666666666666666\n",
+      "Subset loss backward...\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "Subset loss backward time: 11.797855138778687\n",
+      "Optimizer propose...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proposing:   0%|          | 0/5 [00:00<?, ?it/s]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 221.09it/s]\n",
+      "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:01<00:00,  4.45it/s]\n",
+      "\n",
+      "Proposing:  20%|██        | 1/5 [00:02<00:09,  2.46s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 690.80it/s]\n",
+      "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 309.16it/s]\n",
+      "\n",
+      "Proposing:  40%|████      | 2/5 [00:03<00:04,  1.61s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 488.13it/s]\n",
+      "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 365.81it/s]\n",
+      "\n",
+      "Proposing:  60%|██████    | 3/5 [00:04<00:02,  1.36s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 693.52it/s]\n",
+      "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 272.61it/s]\n",
+      "\n",
+      "Proposing:  80%|████████  | 4/5 [00:05<00:01,  1.35s/it]\u001b[A"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "New prompts:  [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Pay special attention to quantities mentioned explicitly, including multiples. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Loading Data: 100%|██████████| 6/6 [00:00<00:00, 767.58it/s]\n",
+      "Evaluating step(11): 0.6667 across 6 samples, Max potential: 0.6667: 100%|██████████| 6/6 [00:00<00:00, 719.89it/s]\n",
+      "\n",
+      "Proposing: 100%|██████████| 5/5 [00:07<00:00,  1.41s/it]\n",
+      "Training Step: 38:  92%|█████████▏| 12/13 [04:02<00:20, 20.21s/it]\n",
+      "Epoch: 100%|██████████| 1/1 [04:02<00:00, 242.58s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n",
+      "Done with proposals\n",
+      "No proposal can improve the subset and full set, go to next step\n",
+      "Reached max steps\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      ")\n",
+      "Teacher generator configured.\n",
+      "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
+      "Starting step: 38\n",
+      "trainer_results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 39:   0%|          | 0/12 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 39\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 161.31it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 54e272c5-1360-462e-b773-4c58c61472ee already exists. Updating the trace.\n",
+      "Trace with id 0e8910c8-703d-4766-a483-c5691125fd03 already exists. Updating the trace.\n",
+      "Trace with id 6c0d3a9a-bb01-4fb3-a68b-1edf66861235 already exists. Updating the trace.\n",
+      "Trace with id fe9b883c-4f47-44f7-a388-b03a2fb10413 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 812.53it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 2283.86it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11023.14it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 294.28it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.11it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 485.47it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11015.90it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['0e8910c8-703d-4766-a483-c5691125fd03']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Fridge\\n\\n  2. Chair\\n\\n  3. Bed\\n\\n  4. Oven\\n\\n  5. Microwave\\n\\n  6. Car\\n\\n\\n  There are 6 objects in total.\\n\\n\\n  Answer: 6'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3194.64it/s]\n",
+      "Evaluating step(39): 0.6818 across 22 samples, Max potential: 0.86:  44%|████▍     | 22/50 [00:15<00:19,  1.45it/s]\n",
+      "Training Step: 40:  17%|█▋        | 2/12 [00:17<02:58, 17.85s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.6521739130434783 <= 0.86, revert\n",
+      "Training Step: 40\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 697.57it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id daa5804f-1aad-4f01-b26c-6b31c57f065f already exists. Updating the trace.\n",
+      "Trace with id 71d549d2-9cc8-46ba-a7f6-d07f69263fd3 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 562.43it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id b538075d-01af-4b76-b835-9005f3044609 already exists. Updating the trace.Trace with id fd34672a-ffd1-498e-a88f-283aa9d4f65d already exists. Updating the trace.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 577.17it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9709.04it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 142.07it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.41it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 311.77it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 713.44it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3973.84it/s]\n",
+      "Evaluating step(40): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 440.54it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 41:  33%|███▎      | 4/12 [00:22<00:49,  6.19s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 41\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 155.20it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 840d9ed5-8222-45a9-a406-7445feae9733 already exists. Updating the trace.\n",
+      "Trace with id 4cd9f4ec-2648-4e85-8e17-3dae1b8558d3 already exists. Updating the trace.\n",
+      "Trace with id ac43f3d4-d67d-4912-95d6-0baa09b52d9a already exists. Updating the trace.\n",
+      "Trace with id 1eb770ed-ff6f-481e-8c16-b9749a44a1a6 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 1098.13it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 521.96it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10292.77it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 172.25it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.39it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 587.31it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1397.05it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3444.16it/s]\n",
+      "Evaluating step(41): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 318.28it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 42:  42%|████▏     | 5/12 [00:24<00:25,  3.71s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 42\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 268.35it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 4dad0f65-d624-48c2-a795-596c00b0535a already exists. Updating the trace.\n",
+      "Trace with id dd9d8748-4926-4bcd-902d-6a4c5cb38267 already exists. Updating the trace.\n",
+      "Trace with id 1f682cab-026c-4803-8018-a45d027aa026 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 522.44it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 344.49it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14755.69it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.06it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.03it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 454.94it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 5319.35it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4718.96it/s]\n",
+      "Evaluating step(42): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 404.64it/s]\n",
+      "Training Step: 43:  58%|█████▊    | 7/12 [00:27<00:17,  3.51s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n",
+      "Training Step: 43\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 261.59it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 46a8994f-fce6-4031-b251-1c8af31d88d2 already exists. Updating the trace.Trace with id 3a9a47c8-a210-43a4-8d24-b9159babb6e4 already exists. Updating the trace.\n",
+      "\n",
+      "Trace with id 234e39df-1bc4-41df-a515-895cb2614a53 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 428.10it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id de4e75d6-a21b-4004-925d-a9a818bd0f7c already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 296.10it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11374.38it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 239.89it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.62it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 447.30it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 475.76it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4960.15it/s]\n",
+      "Evaluating step(43): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 464.52it/s]\n",
+      "Training Step: 44:  67%|██████▋   | 8/12 [00:30<00:10,  2.51s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n",
+      "Training Step: 44\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 237.83it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id aefd17e5-9682-4420-a820-c484a63d6dcd already exists. Updating the trace.\n",
+      "Trace with id 2bc992c0-9832-47f1-87c3-9f6e4b18ee99 already exists. Updating the trace.Trace with id 945f82c7-03d9-4f49-8267-be7abac2bce6 already exists. Updating the trace.\n",
+      "Trace with id 12a6ff3d-f54d-4d89-b5f0-1aec30e96398 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 1138.91it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 394.77it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 443.51it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 247.66it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.52it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 373.33it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 830.43it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['aefd17e5-9682-4420-a820-c484a63d6dcd']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each vegetable step by step:\\n\\n\\n  1. Carrot: 1\\n\\n  2. Onion: 1\\n\\n  3. Stalk of celery: 1\\n\\n  4. Yams: 3\\n\\n  5. Garlic: 1\\n\\n  6. Head of broccoli: 1\\n\\n  7. Potato: 1\\n\\n\\n  Now, let''s add them up:\\n\\n\\n  1 + 1 + 1 + 3 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1197.95it/s]\n",
+      "Evaluating step(44): 0.8333 across 42 samples, Max potential: 0.86:  84%|████████▍ | 42/50 [00:22<00:04,  1.87it/s]\n",
+      "Training Step: 45:  75%|███████▌  | 9/12 [00:57<00:24,  8.31s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.813953488372093 <= 0.86, revert\n",
+      "Training Step: 45\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 164.91it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 701be0ee-29e0-42f5-be04-72d2b73e3968 already exists. Updating the trace.\n",
+      "Trace with id e2bfbbe0-fb79-4df5-9a7d-50c9085947bc already exists. Updating the trace.\n",
+      "Trace with id d4194dd1-739a-4509-8ac8-7c3f89649ee7 already exists. Updating the trace.\n",
+      "Trace with id 1b4b3ab0-d20f-4fc2-a09c-4592a227a8e5 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 731.86it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 244.23it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 395.27it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 140.54it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.11it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 448.16it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 658.37it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2621.44it/s]\n",
+      "Evaluating step(45): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 306.53it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 46:  92%|█████████▏| 11/12 [00:59<00:06,  6.78s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 46\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 256.89it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 82cf82ff-d826-4bb1-847c-9938aeec8ff5 already exists. Updating the trace.\n",
+      "Trace with id 5124e2e6-2aac-4dd3-ab63-9277a7b806a7 already exists. Updating the trace.Trace with id a9a202f5-e723-4d24-ae5e-ad1084a52ef8 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 426.47it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d3f33ded-170a-4b87-9b0b-987d5fb7b817 already exists. Updating the trace.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 266.65it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 380.40it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 251.95it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.75it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 411.12it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 511.05it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['aefd17e5-9682-4420-a820-c484a63d6dcd']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each vegetable step by step:\\n\\n\\n  1. Carrot: 1\\n\\n  2. Onion: 1\\n\\n  3. Stalk of celery: 1\\n\\n  4. Yams: 3\\n\\n  5. Garlic: 1\\n\\n  6. Head of broccoli: 1\\n\\n  7. Potato: 1\\n\\n\\n  Now, let''s add them up:\\n\\n\\n  1 + 1 + 1 + 3 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4016.92it/s]\n",
+      "Evaluating step(46): 0.8333 across 42 samples, Max potential: 0.86:  84%|████████▍ | 42/50 [00:00<00:00, 303.81it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.813953488372093 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 47: 100%|██████████| 12/12 [01:01<00:00,  4.42s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 47\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 96.23it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 85d63f78-39c0-4753-a9fc-52202df48673 already exists. Updating the trace.Trace with id 74d1bc97-46cd-406d-8c3a-2f999aae1b2f already exists. Updating the trace.\n",
+      "\n",
+      "Trace with id 60866bed-8020-4610-a39a-a4a730c035db already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 341.47it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 1d3eceeb-ad24-40f6-8752-2f38241172cb already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 167.75it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 846.95it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 136.09it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:04<00:00,  1.03s/it]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 191.47it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 923.91it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2836.52it/s]\n",
+      "Evaluating step(47): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 371.59it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 48: : 13it [01:07,  4.63s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 48\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 189.96it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 96c716a1-e984-4fe3-9ce0-e156ac709edb already exists. Updating the trace.\n",
+      "Trace with id 3835ee47-6951-49ec-b285-621fc1085024 already exists. Updating the trace.Trace with id 99607986-e107-46b8-b86b-177b295983c4 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 295.41it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Trace with id d46e538c-832d-4eb5-ba9b-a308f666baba already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 161.24it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1621.93it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 153.47it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.07it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 207.08it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 344.25it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s list and count the vegetables mentioned:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (two cabbages)\\n\\n  4. Garlic\\n\\n  5. Carrot\\n\\n  6. Broccoli (head of broccoli)\\n\\n  7. Potato\\n\\n  8. Celery (stalk of celery)\\n\\n  9. Lettuce (lettuce head)\\n\\n\\n  Now, let''s count each vegetable:\\n\\n\\n  1. Yam: 1\\n\\n  2. Cauliflower: 1\\n\\n  3. Cabbages: 2\\n\\n  4. Garlic: 1\\n\\n  5. Carrot: 1\\n\\n  6. Broccoli: 1\\n\\n  7. Potato: 1\\n\\n  8. Celery: 1\\n\\n  9. Lettuce: 1\\n\\n\\n  Adding them up:\\n\\n\\n  1 + 1 + 2 + 1 + 1 + 1 + 1 + 1 + 1 = 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1430.74it/s]\n",
+      "Evaluating step(48): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:28<00:07,  1.41it/s]\n",
+      "Training Step: 49: : 14it [01:39, 11.59s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n",
+      "Training Step: 49\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 122.71it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id ffe67a7b-7b81-4302-b6ed-4b506570274b already exists. Updating the trace.\n",
+      "Trace with id 8895d6bd-eab0-48af-ad4b-51f8007258b1 already exists. Updating the trace.\n",
+      "Trace with id d71ad721-d21d-42f1-af9b-719ff026406b already exists. Updating the trace.Trace with id e250f80e-334e-4f85-ac1f-df9a2013d578 already exists. Updating the trace.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 421.38it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 121.46it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1767.14it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 166.47it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.02it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 206.20it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 983.31it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3168.14it/s]\n",
+      "Evaluating step(49): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 492.44it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 50: : 16it [01:42,  9.33s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 50\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 108.30it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id c42fea48-1b90-4388-92c4-b65b4356a3a2 already exists. Updating the trace.\n",
+      "Trace with id 660c5004-35d2-4a6d-9a06-1e0b3f032f21 already exists. Updating the trace.\n",
+      "Trace with id 7694df14-3a24-40bd-a3fa-036c2645eca3 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 220.83it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1212.75it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 90.57it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.12it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 208.93it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1002.82it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['fd34672a-ffd1-498e-a88f-283aa9d4f65d']\n",
+      "New prompts: [PromptData(id='a530c025-f25c-4423-b146-215ff73586f4', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='0b4dc918-1afb-4f03-9193-90ec51a9abab', name='llm_counter.few_shot_demos', data=\"Example: 'Let''s count each item step by step:\\n\\n\\n  1. Microwave: 1\\n\\n  2. Lamp: 1\\n\\n  3. Cars: 4\\n\\n  4. Stove: 1\\n\\n  5. Toaster: 1\\n\\n  6. Bed: 1\\n\\n\\n  Now, add them all together:\\n\\n\\n  1 + 1 + 4 + 1 + 1 + 1 = 9\\n\\n\\n  Answer: 9'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2261.91it/s]\n",
+      "Evaluating step(50): 0.825 across 40 samples, Max potential: 0.86:  80%|████████  | 40/50 [00:00<00:00, 281.78it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.8048780487804879 <= 0.86, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 50: 100%|██████████| 12/12 [01:49<00:00,  9.15s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved ckpt to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
+      "Training time: 352.5873613357544s\n",
+      "ckpt_file: /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "ckpt_path = \"/content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\"\n",
+    "\n",
+    "train(debug=False, max_steps=12, strategy=\"constrained\",\n",
+    "                raw_shots=0, bootstrap_shots=1,\n",
+    "                resume_from_ckpt=ckpt_path,\n",
+    "                exclude_input_fields_from_bootstrap_demos=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "m5fZGQqLE78r"
+   },
+   "source": [
+    "I decided to experiment further, this time using the \"random\" strategy. The bootstrap demo still has one shot, but this time I make sure the \"input\" is also included in the demonstration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "78JAv4ULEn07",
+    "outputId": "e87bb360-fc26-4dbd-d163-86ab32c292df"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:adalflow.core.generator:Error copying the prompt_kwargs: 'prompt' is not a valid ParameterType\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-3.5-turbo.db\n",
+      "ObjectCountAdalComponent(\n",
+      "  eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
+      "  (task): ObjectCountTaskPipeline(\n",
+      "    (llm_counter): Generator(\n",
+      "      model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "      (prompt): Prompt(\n",
+      "        template: <START_OF_SYSTEM_PROMPT>\n",
+      "        {{system_prompt}}\n",
+      "        {# Few shot demos #}\n",
+      "        {% if few_shot_demos is not none %}\n",
+      "        Here are some examples:\n",
+      "        {{few_shot_demos}}\n",
+      "        {% endif %}\n",
+      "        <END_OF_SYSTEM_PROMPT>\n",
+      "        <START_OF_USER>\n",
+      "        {{input_str}}\n",
+      "        <END_OF_USER>\n",
+      "        , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "      )\n",
+      "      (model_client): OpenAIClient()\n",
+      "      (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "    )\n",
+      "  )\n",
+      "  (loss_fn): EvalFnToTextLoss()\n",
+      ")\n",
+      "Trainer(\n",
+      "  (adaltask): ObjectCountAdalComponent(\n",
+      "    eval_fn: compute_single_item, backward_engine: None, backward_engine_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, teacher_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, text_optimizer_model_config: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}\n",
+      "    (task): ObjectCountTaskPipeline(\n",
+      "      (llm_counter): Generator(\n",
+      "        model_kwargs={'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "        (prompt): Prompt(\n",
+      "          template: <START_OF_SYSTEM_PROMPT>\n",
+      "          {{system_prompt}}\n",
+      "          {# Few shot demos #}\n",
+      "          {% if few_shot_demos is not none %}\n",
+      "          Here are some examples:\n",
+      "          {{few_shot_demos}}\n",
+      "          {% endif %}\n",
+      "          <END_OF_SYSTEM_PROMPT>\n",
+      "          <START_OF_USER>\n",
+      "          {{input_str}}\n",
+      "          <END_OF_USER>\n",
+      "          , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': None}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "        )\n",
+      "        (model_client): OpenAIClient()\n",
+      "        (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "      )\n",
+      "    )\n",
+      "    (loss_fn): EvalFnToTextLoss()\n",
+      "  )\n",
+      ")\n",
+      "raw_shots: 0, bootstrap_shots: 1\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator configured.\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Backward engine configured for all generators.\n",
+      "Restoring prompts: PromptData(id='44f6083f-4cf7-4a9a-bf10-20d218ee4106', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True)\n",
+      "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 51:   0%|          | 0/13 [00:00<?, ?it/s]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 415.27it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 224.54it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 423.57it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10894.30it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2199.38it/s]\n",
+      "Evaluating step(51): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:12<00:00,  3.97it/s]\n",
+      "Training Step: 52:   8%|▊         | 1/13 [00:18<03:38, 18.20s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.86 <= 0.86\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 402.10it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 785.01it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 842.02it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 6660.27it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1760.33it/s]\n",
+      "Evaluating step(52): 0.86 across 50 samples, Max potential: 0.86: 100%|██████████| 50/50 [00:12<00:00,  3.96it/s]\n",
+      "Training Step: 53:  15%|█▌        | 2/13 [00:36<03:21, 18.28s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.86 <= 0.86\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 571.26it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:00<00:00, 988.41it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 608.29it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1177.76it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2074.29it/s]\n",
+      "Evaluating step(53): 0.88 across 50 samples, Max potential: 0.88: 100%|██████████| 50/50 [00:16<00:00,  3.07it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer step: 0.88 > 0.86\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 5848.08it/s]\n",
+      "Evaluating step(53): 0.9 across 100 samples, Max potential: 0.9: 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]\n",
+      "Training Step: 54:  23%|██▎       | 3/13 [01:28<05:35, 33.51s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 297.78it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.95it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 407.40it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 8952.62it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1274.72it/s]\n",
+      "Evaluating step(54): 0.94 across 50 samples, Max potential: 0.94: 100%|██████████| 50/50 [00:16<00:00,  3.06it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer step: 0.94 > 0.88\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 6831.78it/s]\n",
+      "Evaluating step(54): 0.91 across 100 samples, Max potential: 0.91: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]\n",
+      "Training Step: 55:  31%|███       | 4/13 [02:21<06:10, 41.21s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 152.84it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 688.86it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1318.45it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data='You will answer a reasoning question. Carefully count each item and verify your total. List each item individually, ensuring each is counted as \"1\" regardless of quantity mentioned. Show your calculations step by step. The last line of your response should be: \\'Answer: $VALUE\\' where VALUE is a numerical value.', requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2011.16it/s]\n",
+      "Evaluating step(55): 0.8696 across 23 samples, Max potential: 0.94:  46%|████▌     | 23/50 [00:15<00:17,  1.52it/s]\n",
+      "Training Step: 56:  38%|███▊      | 5/13 [02:46<04:43, 35.43s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.8333333333333334 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.66it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 646.55it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 2217.45it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4572.35it/s]\n",
+      "Evaluating step(56): 0.94 across 50 samples, Max potential: 0.94: 100%|██████████| 50/50 [00:00<00:00, 390.77it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.94 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 57:  46%|████▌     | 6/13 [02:54<03:02, 26.03s/it]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 145.48it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.52it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 375.76it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1437.76it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 0.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check for any grouped items and count them correctly. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1614.47it/s]\n",
+      "Evaluating step(57): 0.7857 across 14 samples, Max potential: 0.94:  28%|██▊       | 14/50 [00:19<00:50,  1.41s/it]\n",
+      "Training Step: 58:  54%|█████▍    | 7/13 [03:23<02:42, 27.04s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.7333333333333333 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 137.96it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.94it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 806.79it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 11522.81it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be formatted as: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3560.17it/s]\n",
+      "Evaluating step(58): 0.88 across 25 samples, Max potential: 0.94:  50%|█████     | 25/50 [00:17<00:17,  1.45it/s]\n",
+      "Training Step: 59:  62%|██████▏   | 8/13 [03:47<02:10, 26.06s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.8461538461538461 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.90it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.70it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 552.01it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 5648.89it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 0.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1770.11it/s]\n",
+      "Evaluating step(59): 0.9286 across 42 samples, Max potential: 0.94:  84%|████████▍ | 42/50 [00:16<00:03,  2.49it/s]\n",
+      "Training Step: 60:  69%|██████▉   | 9/13 [04:13<01:43, 26.00s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.9069767441860465 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 314.86it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.10it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 722.53it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 7940.00it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count for precision. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 7188.43it/s]\n",
+      "Evaluating step(60): 0.8966 across 29 samples, Max potential: 0.94:  58%|█████▊    | 29/50 [00:15<00:11,  1.84it/s]\n",
+      "Training Step: 61:  77%|███████▋  | 10/13 [04:35<01:14, 24.87s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.8666666666666667 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 95.68it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.74it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 587.05it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 12520.31it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3036.62it/s]\n",
+      "Evaluating step(61): 0.9286 across 42 samples, Max potential: 0.94:  84%|████████▍ | 42/50 [00:00<00:00, 327.89it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.9069767441860465 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 62:  85%|████████▍ | 11/13 [04:44<00:40, 20.14s/it]\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 136.40it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  3.17it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 417.11it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14339.50it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loss backward...\n",
+      "setting pred name Generator_outputy_pred_2 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_0 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_3 score to 1.0\n",
+      "setting pred name Generator_outputy_pred_1 score to 1.0\n",
+      "Optimizer propose...\n",
+      "New prompts:  [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. Double-check your final count. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=None, requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 5035.06it/s]\n",
+      "Evaluating step(62): 0.9286 across 42 samples, Max potential: 0.94:  84%|████████▍ | 42/50 [00:00<00:00, 327.19it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimizer revert: 0.9069767441860465 <= 0.94\n",
+      "Saving checkpoint to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 62:  92%|█████████▏| 12/13 [04:51<00:24, 24.28s/it]\n",
+      "Epoch:   0%|          | 0/1 [04:51<?, ?it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reached max steps\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    {{system_prompt}}\n",
+      "    {# Few shot demos #}\n",
+      "    {% if few_shot_demos is not none %}\n",
+      "    Here are some examples:\n",
+      "    {{few_shot_demos}}\n",
+      "    {% endif %}\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <START_OF_USER>\n",
+      "    {{input_str}}\n",
+      "    <END_OF_USER>\n",
+      "    , prompt_kwargs: {'system_prompt': \"You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\", 'few_shot_demos': 'None'}, prompt_variables: ['input_str', 'few_shot_demos', 'system_prompt']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "  (output_processors): ParseIntegerAnswerComponent(fun_name=parse_integer_answer)\n",
+      ")\n",
+      "cache_path: /root/.adalflow/cache_OpenAIClient_gpt-4o.db\n",
+      "Configuring teacher generator for Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      ")\n",
+      "Teacher generator set: Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      "), teacher Generator(\n",
+      "  model_kwargs={'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}, \n",
+      "  (prompt): Prompt(\n",
+      "    template: <START_OF_SYSTEM_PROMPT>\n",
+      "    You are the feedback engine in an optimization system.\n",
+      "    \n",
+      "    Your role: Provide intelligent and creative feedback for the variable enclosed in <VARIABLE></VARIABLE> tags, based on the objective specified in <OBJECTIVE_FUNCTION></OBJECTIVE_FUNCTION> tags.\n",
+      "    1. Focus on the downstream OBJECTIVE without proposing new versions of the variable.\n",
+      "    2. Feedback examples: \"Since language models have the X failure mode...\", \"Adding X can fix this error because...\", \"Removing X can improve the objective function because...\", \"Changing X to Y would fix the mistake...\"\n",
+      "    3. Consider the variable in the context of its peers if provided.\n",
+      "    Remember:\n",
+      "    Be concise, critical, and direct.\n",
+      "    <END_OF_SYSTEM_PROMPT>\n",
+      "    <CONVERSATION>\n",
+      "    {{conversation_sec}}\n",
+      "    </CONVERSATION>\n",
+      "    {{objective_instruction_sec}}\n",
+      "    , prompt_variables: ['objective_instruction_sec', 'conversation_sec']\n",
+      "  )\n",
+      "  (model_client): OpenAIClient()\n",
+      ")\n",
+      "Teacher generator configured.\n",
+      "save to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
+      "Starting step: 62\n",
+      "trainer_results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 63:   0%|          | 0/12 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 63\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 175.38it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id fd34672a-ffd1-498e-a88f-283aa9d4f65d already exists. Updating the trace.\n",
+      "Trace with id 82cf82ff-d826-4bb1-847c-9938aeec8ff5 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 46a8994f-fce6-4031-b251-1c8af31d88d2 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00,  4.32it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 445.92it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9063.87it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 132.51it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.85it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 913.05it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1900.02it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2838.94it/s]\n",
+      "Evaluating step(63): 0.5 across 6 samples, Max potential: 0.94:  12%|█▏        | 6/50 [00:31<03:48,  5.20s/it]\n",
+      "Training Step: 64:  17%|█▋        | 2/12 [00:36<06:01, 36.20s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.42857142857142855 <= 0.94, revert\n",
+      "Training Step: 64\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 173.87it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 60866bed-8020-4610-a39a-a4a730c035db already exists. Updating the trace.\n",
+      "Trace with id 7694df14-3a24-40bd-a3fa-036c2645eca3 already exists. Updating the trace."
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 3835ee47-6951-49ec-b285-621fc1085024 already exists. Updating the trace.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:00<00:00,  4.64it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d3f33ded-170a-4b87-9b0b-987d5fb7b817 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 1138.44it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 3232.60it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 151.65it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 725.72it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10845.00it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2971.02it/s]\n",
+      "Evaluating step(64): 0.5 across 6 samples, Max potential: 0.94:  12%|█▏        | 6/50 [00:00<00:00, 136.83it/s]\n",
+      "Training Step: 65:  33%|███▎      | 4/12 [00:41<01:29, 11.21s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.42857142857142855 <= 0.94, revert\n",
+      "Training Step: 65\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 201.47it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 99607986-e107-46b8-b86b-177b295983c4 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id ffe67a7b-7b81-4302-b6ed-4b506570274b already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  50%|█████     | 2/4 [00:00<00:00,  2.54it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 71d549d2-9cc8-46ba-a7f6-d07f69263fd3 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.89it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 4cd9f4ec-2648-4e85-8e17-3dae1b8558d3 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 402.70it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 6304.85it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 218.71it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.19it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 858.52it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 768.93it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1635.33it/s]\n",
+      "Evaluating step(65): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:23<00:00,  2.16it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pass validation: 0.96 > 0.94\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 100/100 [00:00<00:00, 3294.35it/s]\n",
+      "Evaluating step(65): 0.95 across 100 samples, Max potential: 0.95: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]\n",
+      "Training Step: 66:  42%|████▏     | 5/12 [01:50<02:42, 23.20s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 66\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 186.04it/s]\n",
+      "Training:  25%|██▌       | 1/4 [00:00<00:02,  1.01it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id fe9b883c-4f47-44f7-a388-b03a2fb10413 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  50%|█████     | 2/4 [00:01<00:01,  1.30it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 12a6ff3d-f54d-4d89-b5f0-1aec30e96398 already exists. Updating the trace.\n",
+      "Trace with id 840d9ed5-8222-45a9-a406-7445feae9733 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.46it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 3a9a47c8-a210-43a4-8d24-b9159babb6e4 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 636.54it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 9420.11it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 111.34it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.50it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 321.28it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 731.61it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 1120.89it/s]\n",
+      "Evaluating step(66): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 212.00it/s]\n",
+      "Training Step: 67:  58%|█████▊    | 7/12 [01:55<01:32, 18.51s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.96 <= 0.96, revert\n",
+      "Training Step: 67\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data:   0%|          | 0/4 [00:00<?, ?it/s]\u001b[A\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 31.60it/s]\n",
+      "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 5124e2e6-2aac-4dd3-ab63-9277a7b806a7 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  25%|██▌       | 1/4 [00:01<00:05,  1.78s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id ac43f3d4-d67d-4912-95d6-0baa09b52d9a already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  75%|███████▌  | 3/4 [00:02<00:00,  1.63it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d71ad721-d21d-42f1-af9b-719ff026406b already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id c42fea48-1b90-4388-92c4-b65b4356a3a2 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 420.84it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 533.39it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 48.64it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.64it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 396.85it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 8608.11it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4202.88it/s]\n",
+      "Evaluating step(67): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 405.51it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.96 <= 0.96, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 68:  67%|██████▋   | 8/12 [02:02<00:47, 11.99s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 68\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 77.30it/s]\n",
+      "Training:   0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id d4194dd1-739a-4509-8ac8-7c3f89649ee7 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training:  75%|███████▌  | 3/4 [00:01<00:00,  2.62it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 1eb770ed-ff6f-481e-8c16-b9749a44a1a6 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.46it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 54e272c5-1360-462e-b773-4c58c61472ee already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 212.56it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10831.00it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 179.03it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.09it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 502.04it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 639.84it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3305.62it/s]\n",
+      "Evaluating step(68): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 539.54it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.96 <= 0.96, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 69:  75%|███████▌  | 9/12 [02:09<00:32, 10.69s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 69\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 84.70it/s]\n",
+      "Training:  25%|██▌       | 1/4 [00:01<00:03,  1.26s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 0e8910c8-703d-4766-a483-c5691125fd03 already exists. Updating the trace.\n",
+      "Trace with id 74d1bc97-46cd-406d-8c3a-2f999aae1b2f already exists. Updating the trace.\n",
+      "Trace with id 701be0ee-29e0-42f5-be04-72d2b73e3968 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.67it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id de4e75d6-a21b-4004-925d-a9a818bd0f7c already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 331.49it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 488.36it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 274.35it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.51it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 596.31it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14678.23it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4604.98it/s]\n",
+      "Evaluating step(69): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:00<00:00, 88.47it/s]\n",
+      "Training Step: 70:  92%|█████████▏| 11/12 [02:13<00:08,  8.97s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.4 <= 0.96, revert\n",
+      "Training Step: 70\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 169.70it/s]\n",
+      "Training:  25%|██▌       | 1/4 [00:01<00:03,  1.03s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id e2bfbbe0-fb79-4df5-9a7d-50c9085947bc already exists. Updating the trace.\n",
+      "Trace with id 6c34d6e5-0e3d-4243-834e-fd6c5883f467 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.45it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 1b4b3ab0-d20f-4fc2-a09c-4592a227a8e5 already exists. Updating the trace.\n",
+      "Trace with id aefd17e5-9682-4420-a820-c484a63d6dcd already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 285.47it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 288.20it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 262.75it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.60it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 293.12it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1091.27it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4087.46it/s]\n",
+      "Evaluating step(70): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 345.89it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.96 <= 0.96, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 71: 100%|██████████| 12/12 [02:17<00:00,  6.07s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 71\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 87.52it/s]\n",
+      "Training:  25%|██▌       | 1/4 [00:01<00:03,  1.33s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 6c0d3a9a-bb01-4fb3-a68b-1edf66861235 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  50%|█████     | 2/4 [00:01<00:01,  1.37it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 234e39df-1bc4-41df-a515-895cb2614a53 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  75%|███████▌  | 3/4 [00:01<00:00,  1.92it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 8895d6bd-eab0-48af-ad4b-51f8007258b1 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.87it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 4dad0f65-d624-48c2-a795-596c00b0535a already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 262.50it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 10369.11it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 43.64it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 321.11it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1141.46it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['99607986-e107-46b8-b86b-177b295983c4']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 4407.91it/s]\n",
+      "Evaluating step(71): 0.96 across 50 samples, Max potential: 0.96: 100%|██████████| 50/50 [00:00<00:00, 397.66it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.96 <= 0.96, revert\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Training Step: 72: : 13it [02:23,  6.04s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Step: 72\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 113.31it/s]\n",
+      "Training:  25%|██▌       | 1/4 [00:00<00:02,  1.04it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 85d63f78-39c0-4753-a9fc-52202df48673 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  50%|█████     | 2/4 [00:01<00:01,  1.82it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 433650a5-ca75-4867-b235-3af4a7c55c67 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\rTraining:  75%|███████▌  | 3/4 [00:01<00:00,  2.48it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id e250f80e-334e-4f85-ac1f-df9a2013d578 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.86it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 1d3eceeb-ad24-40f6-8752-2f38241172cb already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 170.72it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 13981.01it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 195.45it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:01<00:00,  2.46it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 241.42it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 322.68it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 3970.90it/s]\n",
+      "Evaluating step(72): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:00<00:00, 96.75it/s] \n",
+      "Training Step: 73: : 14it [02:30,  6.33s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.4 <= 0.96, revert\n",
+      "Training Step: 73\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 73.23it/s]\n",
+      "Training:  25%|██▌       | 1/4 [00:01<00:05,  1.97s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id daa5804f-1aad-4f01-b26c-6b31c57f065f already exists. Updating the trace.\n",
+      "Trace with id dd9d8748-4926-4bcd-902d-6a4c5cb38267 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.80it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 04e77795-cc9b-4530-a883-5f775e3fbc76 already exists. Updating the trace.\n",
+      "Trace with id 1f682cab-026c-4803-8018-a45d027aa026 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 211.00it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1551.58it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 205.19it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.80it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 266.20it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1059.57it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['000a3738-1f09-40b0-9f8b-2dec63a3f7f8']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have two heads of broccoli, an apple, a lettuce head, and two onions.\\n  How many vegetables do I have?\\nExample: 'Let''s list each item individually and determine if it is a vegetable:\\n\\n\\n  1. Two heads of broccoli (vegetables)\\n\\n  2. An apple (not a vegetable)\\n\\n  3. A lettuce head (vegetable)\\n\\n  4. Two onions (vegetables)\\n\\n\\n  Now, let''s count the vegetables:\\n\\n\\n  1. Two heads of broccoli\\n\\n  2. One lettuce head\\n\\n  3. Two onions\\n\\n\\n  Total number of vegetables:\\n\\n  2 (broccoli) + 1 (lettuce) + 2 (onions) = 5\\n\\n\\n  Answer: 5'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 2226.42it/s]\n",
+      "Evaluating step(73): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:00<00:00, 91.13it/s]\n",
+      "Training Step: 74: : 16it [02:35,  6.09s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.4 <= 0.96, revert\n",
+      "Training Step: 74\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 93.65it/s]\n",
+      "Training:  25%|██▌       | 1/4 [00:01<00:04,  1.36s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id 945f82c7-03d9-4f49-8267-be7abac2bce6 already exists. Updating the trace.\n",
+      "Trace with id a9a202f5-e723-4d24-ae5e-ad1084a52ef8 already exists. Updating the trace.\n",
+      "Trace with id d46e538c-832d-4eb5-ba9b-a308f666baba already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.59it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trace with id b538075d-01af-4b76-b835-9005f3044609 already exists. Updating the trace.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 115.74it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 1086.11it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 225.28it/s]\n",
+      "Training: 100%|██████████| 4/4 [00:02<00:00,  1.50it/s]\n",
+      "\n",
+      "Loading Data: 100%|██████████| 4/4 [00:00<00:00, 224.88it/s]\n",
+      "Calculating Loss: 100%|██████████| 4/4 [00:00<00:00, 14242.12it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sampled_augmented_demos: ['b538075d-01af-4b76-b835-9005f3044609']\n",
+      "New prompts: [PromptData(id='327b63f0-b532-435a-85d7-6137d4e52c4c', name='llm_counter.system_prompt', data=\"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\", requires_opt=True), PromptData(id='73a3953b-6351-44d8-a36f-7521db346cca', name='llm_counter.few_shot_demos', data=\"input_str: I have a blackberry, a raspberry, a peach, a head of broccoli, a plum,\\n  an orange, two bananas, a grape, two garlics, a nectarine, a lettuce head, and an\\n  apple. How many fruits do I have?\\nExample: 'Let''s list each item and identify whether it is a fruit:\\n\\n\\n  1. Blackberry - Fruit\\n\\n  2. Raspberry - Fruit\\n\\n  3. Peach - Fruit\\n\\n  4. Head of broccoli - Not a fruit\\n\\n  5. Plum - Fruit\\n\\n  6. Orange - Fruit\\n\\n  7. Two bananas - Fruits (2 bananas)\\n\\n  8. Grape - Fruit\\n\\n  9. Two garlics - Not fruits\\n\\n  10. Nectarine - Fruit\\n\\n  11. Lettuce head - Not a fruit\\n\\n  12. Apple - Fruit\\n\\n\\n  Now, let''s count the fruits:\\n\\n\\n  1. Blackberry\\n\\n  2. Raspberry\\n\\n  3. Peach\\n\\n  4. Plum\\n\\n  5. Orange\\n\\n  6. Two bananas (counted as 2)\\n\\n  7. Grape\\n\\n  8. Nectarine\\n\\n  9. Apple\\n\\n\\n  Total number of fruits:\\n\\n  1 + 1 + 1 + 1 + 1 + 2 + 1 + 1 + 1 = 10\\n\\n\\n  Answer: 10'\", requires_opt=True)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Loading Data: 100%|██████████| 50/50 [00:00<00:00, 634.93it/s]\n",
+      "Evaluating step(74): 0.5 across 4 samples, Max potential: 0.96:   8%|▊         | 4/50 [00:28<05:25,  7.07s/it]\n",
+      "Training Step: 74: 100%|██████████| 12/12 [03:12<00:00, 16.04s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fail validation: 0.4 <= 0.96, revert\n",
+      "Saved ckpt to /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n",
+      "Training time: 484.17421078681946s\n",
+      "ckpt_file: /content/adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_4e8a1_run_1.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "train(debug=False, max_steps=12, strategy=\"random\",\n",
+    "                raw_shots=0, bootstrap_shots=1,\n",
+    "                resume_from_ckpt=ckpt_path,\n",
+    "                exclude_input_fields_from_bootstrap_demos=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xTB4lO3PFPnP"
+   },
+   "source": [
+    "Finally, we reached 96% on the validation set and 95% on the test set! This is really close to GPT-4o's performance, and it took us 72 steps.\n",
+    "\n",
+    "The score is consistent, meaning this is a good prompt.\n",
+    "Here is our final optimized prompt:\n",
+    "\n",
+    "System:\n",
+    "\n",
+    "```\n",
+    "\n",
+    "\"prompt\": [\n",
+    "                {\n",
+    "                    \"id\": \"327b63f0-b532-435a-85d7-6137d4e52c4c\",\n",
+    "                    \"name\": \"llm_counter.system_prompt\",\n",
+    "                    \"data\": \"You will answer a reasoning question. Carefully count each item and verify your total. List each item individually and ensure accuracy. Show your calculations step by step. The last line of your response should be: 'Answer: $VALUE' where VALUE is a numerical value.\",\n",
+    "                    \"requires_opt\": true\n",
+    "                },\n",
+    "                {\n",
+    "                    \"id\": \"73a3953b-6351-44d8-a36f-7521db346cca\",\n",
+    "                    \"name\": \"llm_counter.few_shot_demos\",\n",
+    "                    \"data\": \"input_str: I have a yam, a cauliflower, a bed, two cabbages, a garlic, an oven, a\\n  carrot, a head of broccoli, a potato, a stalk of celery, a lettuce head, and a toaster.\\n  How many vegetables do I have?\\nExample: 'Let''s list and count each vegetable individually:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Now, let''s verify the count:\\n\\n\\n  1. Yam\\n\\n  2. Cauliflower\\n\\n  3. Cabbage (1)\\n\\n  4. Cabbage (2)\\n\\n  5. Garlic\\n\\n  6. Carrot\\n\\n  7. Broccoli\\n\\n  8. Potato\\n\\n  9. Celery\\n\\n  10. Lettuce\\n\\n\\n  Total number of vegetables: 10\\n\\n\\n  Answer: 10'\",\n",
+    "                    \"requires_opt\": true\n",
+    "                }\n",
+    "            ]\n",
+    "```\n",
+    "\n",
+    "\n",
+    "You can see every step recorded in the log."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Fr0V3XNCHAis"
+   },
+   "source": [
+    "Happy Optimizing!!!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "3Wnvqs3RyI_z"
+   },
+   "source": [
+    "# Issues and feedback\n",
+    "\n",
+    "If you encounter any issues, please report them here: [GitHub Issues](https://github.com/SylphAI-Inc/AdalFlow/issues).\n",
+    "\n",
+    "For feedback, you can use either [GitHub Discussions](https://github.com/SylphAI-Inc/AdalFlow/discussions) or [Discord](https://discord.gg/ezzszrRZvT)."
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/notebooks/tutorials/adalflow_dataclasses.ipynb b/notebooks/tutorials/adalflow_dataclasses.ipynb
index 5218f5e7..3c96ffe5 100644
--- a/notebooks/tutorials/adalflow_dataclasses.ipynb
+++ b/notebooks/tutorials/adalflow_dataclasses.ipynb
@@ -1,963 +1,963 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "hGLYrUwBmvUD"
-      },
-      "source": [
-        "<a target=\"_blank\" href=\"https://colab.research.google.com/github.com/SylphAI-Inc/AdalFlow/blob/main/notebooks/tutorials/adalflow_dataclasses.ipynb\">\n",
-        "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
-        "</a>\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "gHK6HFngl6iP"
-      },
-      "source": [
-        "# 🤗 Welcome to AdalFlow!\n",
-        "## The library to build & auto-optimize any LLM task pipelines\n",
-        "\n",
-        "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help! ⭐ <i>Star us on <a href=\"https://github.com/SylphAI-Inc/AdalFlow\">Github</a> </i> ⭐\n",
-        "\n",
-        "\n",
-        "# Quick Links\n",
-        "\n",
-        "Github repo: https://github.com/SylphAI-Inc/AdalFlow\n",
-        "\n",
-        "Full Tutorials: https://adalflow.sylph.ai/index.html#.\n",
-        "\n",
-        "Deep dive on each API: check out the [developer notes](https://adalflow.sylph.ai/tutorials/index.html).\n",
-        "\n",
-        "Common use cases along with the auto-optimization:  check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n",
-        "\n",
-        "# Author\n",
-        "\n",
-        "This notebook was created by community contributor [Ajith](https://github.com/ajithvcoder).\n",
-        "\n",
-        "# Outline\n",
-        "\n",
-        "This is a quick introduction of what AdalFlow is capable of. We will cover:\n",
-        "\n",
-        "* How to use `DataClass` with `DataClassParser`.\n",
-        "* How to do nested dataclass, we will test both one and two levels of nesting.\n",
-        "\n",
-        "**Next: Try our [auto-optimization](https://colab.research.google.com/drive/1n3mHUWekTEYHiBdYBTw43TKlPN41A9za?usp=sharing)**\n",
-        "\n",
-        "\n",
-        "# Installation\n",
-        "\n",
-        "1. Use `pip` to install the `adalflow` Python package. We will need `openai` and `groq`from the extra packages.\n",
-        "\n",
-        "  ```bash\n",
-        "  pip install adalflow[openai,groq]\n",
-        "  ```\n",
-        "2. Setup  `openai` and `groq` API key in the environment variables"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "nqe-vxB1BCux"
-      },
-      "source": [
-        "### Install adalflow"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "id": "ZaaevxNH9JMQ"
-      },
-      "outputs": [],
-      "source": [
-        "# Install adalflow with necessary dependencies\n",
-        "from IPython.display import clear_output\n",
-        "\n",
-        "!pip install -U adalflow[openai,groq]\n",
-        "\n",
-        "clear_output()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "NGE70aZ8BLuf"
-      },
-      "source": [
-        "### Set Environment Variables\n",
-        "\n",
-        "Note: Enter your api keys in below cell"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 23,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "j2xmGr_99YDq",
-        "outputId": "c3d1e0b7-9072-412e-fed1-4578404357be"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Overwriting .env\n"
-          ]
-        }
-      ],
-      "source": [
-        "%%writefile .env\n",
-        "\n",
-        "OPENAI_API_KEY=\"PASTE-OPENAI_API_KEY_HERE\"\n",
-        "GROQ_API_KEY=\"PASTE-GROQ_API_KEY-HERE\""
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 15,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "API keys have been set.\n"
-          ]
-        }
-      ],
-      "source": [
-        "#  or more securely\n",
-        "\n",
-        "import os\n",
-        "\n",
-        "from getpass import getpass\n",
-        "\n",
-        "# Prompt user to enter their API keys securely\n",
-        "groq_api_key = getpass(\"Please enter your GROQ API key: \")\n",
-        "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
-        "\n",
-        "\n",
-        "# Set environment variables\n",
-        "os.environ['GROQ_API_KEY'] = groq_api_key\n",
-        "os.environ['OPENAI_API_KEY'] = openai_api_key\n",
-        "\n",
-        "print(\"API keys have been set.\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ZxBkm77uBZpl"
-      },
-      "source": [
-        "### Import necessary libraries"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "id": "wOAiKg899Z2u"
-      },
-      "outputs": [],
-      "source": [
-        "# Import required libraries\n",
-        "from dataclasses import dataclass, field\n",
-        "from typing import List, Dict\n",
-        "import adalflow as adal\n",
-        "from adalflow.components.model_client import GroqAPIClient\n",
-        "from adalflow.utils import setup_env"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "metadata": {},
-      "outputs": [
-        {
-          "data": {
-            "text/plain": [
-              "'0.2.4'"
-            ]
-          },
-          "execution_count": 2,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "adal.__version__"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "bTzgyp6S9bnH"
-      },
-      "outputs": [],
-      "source": [
-        "# Load environment variables - Make sure to have OPENAI_API_KEY in .env file and .env is present in current folder\n",
-        "setup_env(\".env\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MBW5viOG9hM8"
-      },
-      "source": [
-        "### Basic Vannila Example"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 3,
-      "metadata": {
-        "id": "YA4pAIek9ewc"
-      },
-      "outputs": [],
-      "source": [
-        "# Define the output structure using dataclass\n",
-        "@dataclass\n",
-        "class BasicQAOutput(adal.DataClass):\n",
-        "    explanation: str = field(\n",
-        "        metadata={\"desc\": \"A brief explanation of the concept in one sentence.\"}\n",
-        "    )\n",
-        "    example: str = field(\n",
-        "        metadata={\"desc\": \"An example of the concept in a sentence.\"}\n",
-        "    )\n",
-        "    # Control output fields order\n",
-        "    __output_fields__ = [\"explanation\", \"example\"]\n",
-        "\n",
-        "# Define the template using jinja2 syntax\n",
-        "qa_template = r\"\"\"<SYS>\n",
-        "You are a helpful assistant.\n",
-        "<OUTPUT_FORMAT>\n",
-        "{{output_format_str}}\n",
-        "</OUTPUT_FORMAT>\n",
-        "</SYS>\n",
-        "<USER> {{input_str}} </USER>\"\"\""
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 4,
-      "metadata": {
-        "id": "x4__jnbP9luN"
-      },
-      "outputs": [],
-      "source": [
-        "# Define the QA component\n",
-        "class QA(adal.Component):\n",
-        "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
-        "        super().__init__()\n",
-        "\n",
-        "        # Initialize the parser with the output dataclass\n",
-        "        parser = adal.DataClassParser(data_class=BasicQAOutput, return_data_class=True)\n",
-        "\n",
-        "        # Set up the generator with model, template, and parser\n",
-        "        self.generator = adal.Generator(\n",
-        "            model_client=model_client,\n",
-        "            model_kwargs=model_kwargs,\n",
-        "            template=qa_template,\n",
-        "            prompt_kwargs={\"output_format_str\": parser.get_output_format_str()},\n",
-        "            output_processors=parser,\n",
-        "        )\n",
-        "\n",
-        "    def call(self, query: str):\n",
-        "        \"\"\"Synchronous call to generate response\"\"\"\n",
-        "        return self.generator.call({\"input_str\": query})\n",
-        "\n",
-        "    async def acall(self, query: str):\n",
-        "        \"\"\"Asynchronous call to generate response\"\"\"\n",
-        "        return await self.generator.acall({\"input_str\": query})\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 5,
-      "metadata": {
-        "id": "TVi3rGvs9nte"
-      },
-      "outputs": [],
-      "source": [
-        "# Example usage\n",
-        "def run_basic_example():\n",
-        "    # Instantiate the QA class with Groq model\n",
-        "    qa = QA(\n",
-        "        model_client=GroqAPIClient(),\n",
-        "        model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
-        "    )\n",
-        "\n",
-        "    # Print the QA instance details\n",
-        "    print(qa)\n",
-        "\n",
-        "    # Test the QA system\n",
-        "    response = qa(\"What is LLM?\")\n",
-        "    print(\"\\nResponse:\")\n",
-        "    print(response)\n",
-        "    print(f\"BasicQAOutput: {response.data}\")\n",
-        "    print(f\"Explanation: {response.data.explanation}\")\n",
-        "    print(f\"Example: {response.data.example}\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 6,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "QA(\n",
-            "  (generator): Generator(\n",
-            "    model_kwargs={'model': 'llama3-8b-8192'}, trainable_prompt_kwargs=[]\n",
-            "    (prompt): Prompt(\n",
-            "      template: <SYS>\n",
-            "      You are a helpful assistant.\n",
-            "      <OUTPUT_FORMAT>\n",
-            "      {{output_format_str}}\n",
-            "      </OUTPUT_FORMAT>\n",
-            "      </SYS>\n",
-            "      <USER> {{input_str}} </USER>, prompt_kwargs: {'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n    \"explanation\": \"A brief explanation of the concept in one sentence. (str) (required)\",\\n    \"example\": \"An example of the concept in a sentence. (str) (required)\"\\n}\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['input_str', 'output_format_str']\n",
-            "    )\n",
-            "    (model_client): GroqAPIClient()\n",
-            "    (output_processors): DataClassParser(\n",
-            "      data_class=BasicQAOutput, format_type=json,            return_data_class=True, input_fields=[],            output_fields=['explanation', 'example']\n",
-            "      (_output_processor): JsonParser()\n",
-            "      (output_format_prompt): Prompt(\n",
-            "        template: Your output should be formatted as a standard JSON instance with the following schema:\n",
-            "        ```\n",
-            "        {{schema}}\n",
-            "        ```\n",
-            "        -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n",
-            "        -Use double quotes for the keys and string values.\n",
-            "        -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\n",
-            "        -Follow the JSON formatting conventions., prompt_variables: ['schema']\n",
-            "      )\n",
-            "    )\n",
-            "  )\n",
-            ")\n",
-            "\n",
-            "Response:\n",
-            "GeneratorOutput(id=None, data=BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy'), error=None, usage=CompletionUsage(completion_tokens=60, prompt_tokens=174, total_tokens=234), raw_response='```\\n{\\n    \"explanation\": \"Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\",\\n    \"example\": \"The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\"\\n}\\n```', metadata=None)\n",
-            "BasicQAOutput: BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy')\n",
-            "Explanation: Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\n",
-            "Example: The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\n"
-          ]
-        }
-      ],
-      "source": [
-        "run_basic_example()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "1n7edLQ19ql8"
-      },
-      "source": [
-        "### Example 1 - Movie analysis data class"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 13,
-      "metadata": {
-        "id": "5Arp4-Dq9u49"
-      },
-      "outputs": [],
-      "source": [
-        "# 1. Basic DataClass with different field types\n",
-        "@dataclass\n",
-        "class MovieReview(adal.DataClass):\n",
-        "    title: str = field(\n",
-        "        metadata={\"desc\": \"The title of the movie\"}\n",
-        "    )\n",
-        "    rating: float = field(\n",
-        "        metadata={\n",
-        "            \"desc\": \"Rating from 1.0 to 10.0\",\n",
-        "            \"min\": 1.0,\n",
-        "            \"max\": 10.0\n",
-        "        }\n",
-        "    )\n",
-        "    pros: List[str] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of positive points about the movie\"}\n",
-        "    )\n",
-        "    cons: List[str] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of negative points about the movie\"}\n",
-        "    )\n",
-        "\n",
-        "    __output_fields__ = [\"title\", \"rating\", \"pros\", \"cons\"]\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 14,
-      "metadata": {
-        "id": "VLbRUzXg9yP0"
-      },
-      "outputs": [],
-      "source": [
-        "\n",
-        "@dataclass\n",
-        "class Actor(adal.DataClass):\n",
-        "    name: str = field(metadata={\"desc\": \"Actor's full name\"})\n",
-        "    role: str = field(metadata={\"desc\": \"Character name in the movie\"})"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 15,
-      "metadata": {
-        "id": "7MUcu0tk91l4"
-      },
-      "outputs": [],
-      "source": [
-        "# 2. Nested DataClass example\n",
-        "\n",
-        "# Have both MovieReview and Actor nested in DetailedMovieReview\n",
-        "\n",
-        "@dataclass\n",
-        "class DetailedMovieReview(adal.DataClass):\n",
-        "    basic_review: MovieReview\n",
-        "    cast: List[Actor] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of main actors in the movie\"}\n",
-        "    )\n",
-        "    genre: List[str] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of genres for the movie\"}\n",
-        "    )\n",
-        "    recommend: bool = field(\n",
-        "        default_factory=str,\n",
-        "        metadata={\"desc\": \"Whether you would recommend this movie\"}\n",
-        "    )\n",
-        "\n",
-        "    __output_fields__ = [\"basic_review\", \"cast\", \"genre\", \"recommend\"]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 16,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Example template for movie review\n",
-        "movie_review_template = r\"\"\"<SYS>\n",
-        "You are a professional movie critic. Analyze the given movie and provide a detailed review.\n",
-        "<OUTPUT_FORMAT>\n",
-        "{{output_format_str}}\n",
-        "</OUTPUT_FORMAT>\n",
-        "</SYS>\n",
-        "<USER> Review this movie: {{movie_title}} </USER>\"\"\""
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 17,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Create the MovieReviewer component with MovieAnalysis data class\n",
-        "class MovieReviewer(adal.Component):\n",
-        "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict, data_class: adal.DataClass):\n",
-        "        super().__init__()\n",
-        "        self.additional_structure_prompt = \"Dont use 'type' and 'properties' in output directly give as dict\"\n",
-        "        parser = adal.DataClassParser(\n",
-        "            data_class=data_class,\n",
-        "            return_data_class=True\n",
-        "        )\n",
-        "        self.generator = adal.Generator(\n",
-        "            model_client=model_client,\n",
-        "            model_kwargs=model_kwargs,\n",
-        "            template=movie_review_template,\n",
-        "            prompt_kwargs={\"output_format_str\": parser.get_output_format_str() + self.additional_structure_prompt},\n",
-        "            output_processors=parser,\n",
-        "        )\n",
-        "\n",
-        "    def call(self, movie_title: str):\n",
-        "        return self.generator.call({\"movie_title\": movie_title})"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 18,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action'], recommend=True)\n",
-            "BasicReview: MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts'])\n",
-            "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n"
-          ]
-        }
-      ],
-      "source": [
-        "# test the data class with one level of nesting\n",
-        "\n",
-        "reviewer = MovieReviewer(\n",
-        "    model_client=GroqAPIClient(),\n",
-        "    model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
-        "    data_class=DetailedMovieReview\n",
-        ")\n",
-        "\n",
-        "response = reviewer(\"The Matrix\")\n",
-        "print(f\"DetailedMovieReview: {response.data}\")\n",
-        "print(f\"BasicReview: {response.data.basic_review}\")\n",
-        "print(f\"Cast: {response.data.cast}\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 19,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 'Thought-provoking storyline', 'Engaging cyberpunk aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n",
-            "BasicReview: MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 'Thought-provoking storyline', 'Engaging cyberpunk aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards'])\n",
-            "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')]\n"
-          ]
-        }
-      ],
-      "source": [
-        "# try use openai model\n",
-        "reviewer = MovieReviewer(\n",
-        "    model_client=adal.OpenAIClient(),\n",
-        "    model_kwargs={\"model\": \"gpt-4o\"},\n",
-        "    data_class=DetailedMovieReview\n",
-        ")\n",
-        "response = reviewer(\"The Matrix\")\n",
-        "print(f\"DetailedMovieReview: {response.data}\")\n",
-        "print(f\"BasicReview: {response.data.basic_review}\")\n",
-        "print(f\"Cast: {response.data.cast}\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "We see both models can handle one level of nested dataclass quite well. And the output ordering will follow the ordering specified in __output_fields__"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 20,
-      "metadata": {
-        "id": "ekr4v8Xg93en"
-      },
-      "outputs": [],
-      "source": [
-        "# 3. second level nested dataclass\n",
-        "\n",
-        "@dataclass\n",
-        "class MovieAnalysis(adal.DataClass):\n",
-        "    review: DetailedMovieReview\n",
-        "    box_office: float = field(\n",
-        "        default=None,\n",
-        "        metadata={\"desc\": \"Box office earnings in millions of dollars\"}\n",
-        "    )\n",
-        "    awards: Dict[str, int] = field(\n",
-        "        default=None,\n",
-        "        metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n",
-        "    )\n",
-        "\n",
-        "    __output_fields__ = [\"review\", \"box_office\", \"awards\"]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 25,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True), box_office=463.5, awards={'Best Visual Effects': 4, 'Best Film Editing': 2, 'Best Sound': 1})\n",
-            "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True)\n",
-            "BasicReview: MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation'])\n",
-            "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n"
-          ]
-        }
-      ],
-      "source": [
-        "# test the data class with two levels of nested dataclass\n",
-        "\n",
-        "# gpt-3.5-turbo model\n",
-        "\n",
-        "analysis = MovieReviewer(\n",
-        "    model_client=adal.OpenAIClient(),\n",
-        "    model_kwargs={\"model\": \"gpt-3.5-turbo\"},\n",
-        "    data_class=MovieAnalysis\n",
-        ")\n",
-        "\n",
-        "response = analysis(\"The Matrix\")\n",
-        "print(f\"MovieAnalysis: {response.data}\")\n",
-        "print(f\"DetailedMovieReview: {response.data.review}\")\n",
-        "print(f\"BasicReview: {response.data.review.basic_review}\")\n",
-        "print(f\"Cast: {response.data.review.cast}\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 24,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True), box_office=463.5, awards={'Academy Awards': 4, 'MTV Movie Awards': 10, 'Saturn Awards': 7})\n",
-            "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n",
-            "BasicReview: MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts'])\n",
-            "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n"
-          ]
-        }
-      ],
-      "source": [
-        "# test the data class with two levels of nested dataclass\n",
-        "\n",
-        "analysis = MovieReviewer(\n",
-        "    model_client=GroqAPIClient(),\n",
-        "    model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
-        "    data_class=MovieAnalysis\n",
-        ")\n",
-        "\n",
-        "response = analysis(\"The Matrix\")\n",
-        "print(f\"MovieAnalysis: {response.data}\")\n",
-        "print(f\"DetailedMovieReview: {response.data.review}\")\n",
-        "print(f\"BasicReview: {response.data.review.basic_review}\")\n",
-        "print(f\"Cast: {response.data.review.cast}\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "pSTrf8_t-DCx"
-      },
-      "source": [
-        "### Example 2: Song Review\n",
-        "Note: Song Review is modified by keeping Example 1 - Movie Review as a reference so that we would know how to use DataClasses for similar purposes"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 26,
-      "metadata": {
-        "id": "7g9bUa0q-B6Y"
-      },
-      "outputs": [],
-      "source": [
-        "# 1. Basic DataClass with different field types\n",
-        "@dataclass\n",
-        "class SongReview(adal.DataClass):\n",
-        "    title: str = field(\n",
-        "        metadata={\"desc\": \"The title of the song\"}\n",
-        "    )\n",
-        "    album: str = field(\n",
-        "        metadata={\"desc\": \"The album of the song\"}\n",
-        "    )\n",
-        "    ranking: int = field(\n",
-        "        metadata={\n",
-        "            \"desc\": \"Billboard peak ranking from 1 to 200\",\n",
-        "            \"min\": 1,\n",
-        "            \"max\": 200\n",
-        "        }\n",
-        "    )\n",
-        "    streaming: Dict[str, int] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"Dict of lastest approximate streaming count in spotify and in youtube. Gives the count in millions\"}\n",
-        "    )\n",
-        "    pros: List[str] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of positive points about the song\"}\n",
-        "    )\n",
-        "    cons: List[str] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of negative points about the song\"}\n",
-        "    )\n",
-        "\n",
-        "    __output_fields__ = [\"title\", \"rating\", \"streaming\", \"pros\", \"cons\"]\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 27,
-      "metadata": {
-        "id": "UGhMRZht-HiB"
-      },
-      "outputs": [],
-      "source": [
-        "\n",
-        "@dataclass\n",
-        "class Artist(adal.DataClass):\n",
-        "    name: str = field(metadata={\"desc\": \"Artist's full name\"})\n",
-        "    role: str = field(metadata={\"desc\": \"Artist's role in the song\"})"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 28,
-      "metadata": {
-        "id": "sfNWgPYN-JAj"
-      },
-      "outputs": [],
-      "source": [
-        "# 2. Nested DataClass example\n",
-        "\n",
-        "@dataclass\n",
-        "class DetailedSongReview(adal.DataClass):\n",
-        "    basic_review: SongReview = field(\n",
-        "        default=SongReview, metadata={\"desc\": \"basic Song review details\"}\n",
-        "    )\n",
-        "    cast: List[Artist] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of main singer, lyrisist and musicians in the song\"}\n",
-        "    )\n",
-        "    genre: List[str] = field(\n",
-        "        default_factory=list,\n",
-        "        metadata={\"desc\": \"List of genres for the song\"}\n",
-        "    )\n",
-        "    recommend: bool = field(\n",
-        "        default_factory=str,\n",
-        "        metadata={\"desc\": \"Whether you would recommend this song\"}\n",
-        "    )\n",
-        "\n",
-        "    __output_fields__ = [\"basic_review\", \"cast\", \"genre\", \"recommend\"]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 29,
-      "metadata": {
-        "id": "HG8rtCd8-K7t"
-      },
-      "outputs": [],
-      "source": [
-        "# 3. two levels of nesting dataclass\n",
-        "\n",
-        "# all these fields as we use default, it is optional, so \n",
-        "# llm might not output that field if they dont have information\n",
-        "\n",
-        "@dataclass\n",
-        "class SongAnalysis(adal.DataClass):\n",
-        "    review: DetailedSongReview = field(\n",
-        "        default=DetailedSongReview, metadata={\"desc\": \"Song review details\"}\n",
-        "    )\n",
-        "    duration: float = field(\n",
-        "        default=None,\n",
-        "        metadata={\"desc\": \"Duration of the song\"}\n",
-        "    )\n",
-        "    awards: Dict[str, int] = field(\n",
-        "        default=None,\n",
-        "        metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n",
-        "    )\n",
-        "\n",
-        "    __output_fields__ = [\"review\", \"duration\", \"awards\"]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 30,
-      "metadata": {
-        "id": "v3mNeyz7-MpY"
-      },
-      "outputs": [],
-      "source": [
-        "# Example template for song review\n",
-        "song_review_template = r\"\"\"<SYS>\n",
-        "You are a professional song critic. Analyze the given song and provide a detailed review.\n",
-        "<OUTPUT_FORMAT>\n",
-        "{{output_format_str}}\n",
-        "</OUTPUT_FORMAT>\n",
-        "</SYS>\n",
-        "<USER> Review this song: {{song_title}} </USER>\"\"\"\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 31,
-      "metadata": {
-        "id": "X2eifXOU-OrE"
-      },
-      "outputs": [],
-      "source": [
-        "# Create the SongReviewer component with SongAnalysis data class\n",
-        "class SongReviewer(adal.Component):\n",
-        "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
-        "        super().__init__()\n",
-        "        self.additional_structure_prompt = \"Dont use 'type' and 'properties' in output directly give as dict\"\n",
-        "        parser = adal.DataClassParser(\n",
-        "            data_class=SongAnalysis,\n",
-        "            return_data_class=False,\n",
-        "            format_type=\"json\"\n",
-        "        )\n",
-        "        self.generator = adal.Generator(\n",
-        "            model_client=model_client,\n",
-        "            model_kwargs=model_kwargs,\n",
-        "            template=song_review_template,\n",
-        "            prompt_kwargs={\"output_format_str\": parser.get_output_format_str() + self.additional_structure_prompt },\n",
-        "            output_processors=parser,\n",
-        "        )\n",
-        "\n",
-        "    def call(self, song_title: str):\n",
-        "        return self.generator.call({\"song_title\": song_title})"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 36,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "SongAnalysis: {'review': {'basic_review': {'title': 'Shape of You', 'album': '÷ (Divide)', 'ranking': 7, 'streaming': {'spotify': 4.5, 'youtube': 2.5}, 'pros': ['Catchy beat', 'Catchy melody', 'Funky rhythm', 'Great lyrics'], 'cons': ['Some may find the lyrics objectifying', 'Not typically my cup of tea']}, 'cast': [{'name': 'Ed Sheeran', 'role': 'Lead vocals, songwriting'}], 'genre': ['Pop', 'Dance', 'Electro'], 'recommend': True}, 'duration': 3.53}\n"
-          ]
-        }
-      ],
-      "source": [
-        "analysis = SongReviewer(\n",
-        "     model_client=GroqAPIClient(),\n",
-        "     model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
-        ")\n",
-        "\n",
-        "response = analysis(\"Shape of you\")\n",
-        "print(f\"SongAnalysis: {response.data}\")\n",
-        "\n",
-        "# this time as we set `return_data_class` to False in the parser, we get the output as dict"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 38,
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Song Title: Shape of You\n",
-            "Album: ÷ (Divide)\n",
-            "Ranking: 7\n",
-            "- spotify - 4.5 million views\n",
-            "- youtube - 2.5 million views\n",
-            "\n",
-            "Pros:\n",
-            "- Catchy beat\n",
-            "- Catchy melody\n",
-            "- Funky rhythm\n",
-            "- Great lyrics\n",
-            "\n",
-            "Artist's:\n",
-            "- Ed Sheeran as Lead vocals, songwriting\n",
-            "\n",
-            "Genere:  \n",
-            " Pop \n",
-            " Dance \n",
-            " Electro \n",
-            "\n",
-            "Duration: 3.53 minutes\n"
-          ]
-        }
-      ],
-      "source": [
-        "# Access nested data\n",
-        "analysis = response.data\n",
-        "print(f\"Song Title: {analysis['review']['basic_review']['title']}\")\n",
-        "print(f\"Album: {analysis['review']['basic_review']['album']}\")\n",
-        "print(f\"Ranking: {analysis['review']['basic_review']['ranking']}\")\n",
-        "\n",
-        "for platform, views in analysis['review']['basic_review']['streaming'].items():\n",
-        "    print(f\"- {platform} - {views} million views\")\n",
-        "print(\"\\nPros:\")\n",
-        "for pro in analysis['review'][\"basic_review\"][\"pros\"]:\n",
-        "    print(f\"- {pro}\")\n",
-        "\n",
-        "print(\"\\nArtist's:\")\n",
-        "for actor in analysis['review'][\"cast\"]:\n",
-        "        print(f\"- {actor['name']} as {actor['role']}\")\n",
-        "\n",
-        "if analysis['review']['genre']:\n",
-        "    print(f\"\\nGenere:  \")\n",
-        "    for genre in analysis['review']['genre']:\n",
-        "        print(f\" {genre} \")\n",
-        "\n",
-        "if analysis['duration']:\n",
-        "    print(f\"\\nDuration: {analysis['duration']} minutes\")\n",
-        "\n",
-        "if hasattr(analysis, 'awards') and analysis['awards']:\n",
-        "    print(\"\\nAwards:\")\n",
-        "    for category, count in analysis['awards'].items():\n",
-        "        print(f\"- {category}: {count}\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "TODOs:\n",
-        "1. Add `JsonOutputParser` and `YamlOutputParser` to this notebook."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "BLAF5qTEmoyW"
-      },
-      "source": [
-        "# Issues and feedback\n",
-        "\n",
-        "If you encounter any issues, please report them here: [GitHub Issues](https://github.com/SylphAI-Inc/LightRAG/issues).\n",
-        "\n",
-        "For feedback, you can use either the [GitHub discussions](https://github.com/SylphAI-Inc/LightRAG/discussions) or [Discord](https://discord.gg/ezzszrRZvT)."
-      ]
-    }
-  ],
-  "metadata": {
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "hGLYrUwBmvUD"
+   },
+   "source": [
+    "<a target=\"_blank\" href=\"https://colab.research.google.com/github/SylphAI-Inc/AdalFlow/blob/main/notebooks/tutorials/adalflow_dataclasses.ipynb\">\n",
+    "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+    "</a>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gHK6HFngl6iP"
+   },
+   "source": [
+    "# 🤗 Welcome to AdalFlow!\n",
+    "## The library to build & auto-optimize any LLM task pipelines\n",
+    "\n",
+    "Thanks for trying us out! We're here to provide you with the best LLM application development experience you can dream of 😊 If you have any questions or concerns, [come talk to us on Discord](https://discord.gg/ezzszrRZvT); we're always here to help! ⭐ <i>Star us on <a href=\"https://github.com/SylphAI-Inc/AdalFlow\">GitHub</a> </i> ⭐\n",
+    "\n",
+    "\n",
+    "# Quick Links\n",
+    "\n",
+    "Github repo: https://github.com/SylphAI-Inc/AdalFlow\n",
+    "\n",
+    "Full Tutorials: https://adalflow.sylph.ai/index.html#.\n",
+    "\n",
+    "Deep dive on each API: check out the [developer notes](https://adalflow.sylph.ai/tutorials/index.html).\n",
+    "\n",
+    "Common use cases along with the auto-optimization:  check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n",
+    "\n",
+    "# Author\n",
+    "\n",
+    "This notebook was created by community contributor [Ajith](https://github.com/ajithvcoder).\n",
+    "\n",
+    "# Outline\n",
+    "\n",
+    "This is a quick introduction of what AdalFlow is capable of. We will cover:\n",
+    "\n",
+    "* How to use `DataClass` with `DataClassParser`.\n",
+    "* How to nest dataclasses; we will test both one and two levels of nesting.\n",
+    "\n",
+    "**Next: Try our [auto-optimization](https://colab.research.google.com/drive/1n3mHUWekTEYHiBdYBTw43TKlPN41A9za?usp=sharing)**\n",
+    "\n",
+    "\n",
+    "# Installation\n",
+    "\n",
+    "1. Use `pip` to install the `adalflow` Python package. We will need `openai` and `groq` from the extra packages.\n",
+    "\n",
+    "  ```bash\n",
+    "  pip install adalflow[openai,groq]\n",
+    "  ```\n",
+    "2. Set up the `openai` and `groq` API keys in the environment variables."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "nqe-vxB1BCux"
+   },
+   "source": [
+    "### Install adalflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "ZaaevxNH9JMQ"
+   },
+   "outputs": [],
+   "source": [
+    "# Install adalflow with necessary dependencies\n",
+    "from IPython.display import clear_output\n",
+    "\n",
+    "!pip install -U adalflow[openai,groq]\n",
+    "\n",
+    "clear_output()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "NGE70aZ8BLuf"
+   },
+   "source": [
+    "### Set Environment Variables\n",
+    "\n",
+    "Note: Enter your API keys in the cell below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
     "colab": {
-      "collapsed_sections": [
-        "nqe-vxB1BCux",
-        "NGE70aZ8BLuf"
-      ],
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "my-project-kernel",
-      "language": "python",
-      "name": "my-project-kernel"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.12.4"
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "j2xmGr_99YDq",
+    "outputId": "c3d1e0b7-9072-412e-fed1-4578404357be"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting .env\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile .env\n",
+    "\n",
+    "OPENAI_API_KEY=\"PASTE-OPENAI_API_KEY_HERE\"\n",
+    "GROQ_API_KEY=\"PASTE-GROQ_API_KEY-HERE\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "API keys have been set.\n"
+     ]
     }
+   ],
+   "source": [
+    "#  or more securely\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "from getpass import getpass\n",
+    "\n",
+    "# Prompt user to enter their API keys securely\n",
+    "groq_api_key = getpass(\"Please enter your GROQ API key: \")\n",
+    "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
+    "\n",
+    "\n",
+    "# Set environment variables\n",
+    "os.environ['GROQ_API_KEY'] = groq_api_key\n",
+    "os.environ['OPENAI_API_KEY'] = openai_api_key\n",
+    "\n",
+    "print(\"API keys have been set.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ZxBkm77uBZpl"
+   },
+   "source": [
+    "### Import necessary libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "wOAiKg899Z2u"
+   },
+   "outputs": [],
+   "source": [
+    "# Import required libraries\n",
+    "from dataclasses import dataclass, field\n",
+    "from typing import List, Dict\n",
+    "import adalflow as adal\n",
+    "from adalflow.components.model_client import GroqAPIClient\n",
+    "from adalflow.utils import setup_env"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'0.2.4'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adal.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bTzgyp6S9bnH"
+   },
+   "outputs": [],
+   "source": [
+    "# Load environment variables - Make sure to have OPENAI_API_KEY in .env file and .env is present in current folder\n",
+    "setup_env(\".env\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MBW5viOG9hM8"
+   },
+   "source": [
+    "### Basic Vanilla Example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "YA4pAIek9ewc"
+   },
+   "outputs": [],
+   "source": [
+    "# Define the output structure using dataclass\n",
+    "@dataclass\n",
+    "class BasicQAOutput(adal.DataClass):\n",
+    "    explanation: str = field(\n",
+    "        metadata={\"desc\": \"A brief explanation of the concept in one sentence.\"}\n",
+    "    )\n",
+    "    example: str = field(\n",
+    "        metadata={\"desc\": \"An example of the concept in a sentence.\"}\n",
+    "    )\n",
+    "    # Control output fields order\n",
+    "    __output_fields__ = [\"explanation\", \"example\"]\n",
+    "\n",
+    "# Define the template using jinja2 syntax\n",
+    "qa_template = r\"\"\"<SYS>\n",
+    "You are a helpful assistant.\n",
+    "<OUTPUT_FORMAT>\n",
+    "{{output_format_str}}\n",
+    "</OUTPUT_FORMAT>\n",
+    "</SYS>\n",
+    "<USER> {{input_str}} </USER>\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "id": "x4__jnbP9luN"
+   },
+   "outputs": [],
+   "source": [
+    "# Define the QA component\n",
+    "class QA(adal.Component):\n",
+    "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
+    "        super().__init__()\n",
+    "\n",
+    "        # Initialize the parser with the output dataclass\n",
+    "        parser = adal.DataClassParser(data_class=BasicQAOutput, return_data_class=True)\n",
+    "\n",
+    "        # Set up the generator with model, template, and parser\n",
+    "        self.generator = adal.Generator(\n",
+    "            model_client=model_client,\n",
+    "            model_kwargs=model_kwargs,\n",
+    "            template=qa_template,\n",
+    "            prompt_kwargs={\"output_format_str\": parser.get_output_format_str()},\n",
+    "            output_processors=parser,\n",
+    "        )\n",
+    "\n",
+    "    def call(self, query: str):\n",
+    "        \"\"\"Synchronous call to generate response\"\"\"\n",
+    "        return self.generator.call({\"input_str\": query})\n",
+    "\n",
+    "    async def acall(self, query: str):\n",
+    "        \"\"\"Asynchronous call to generate response\"\"\"\n",
+    "        return await self.generator.acall({\"input_str\": query})\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "id": "TVi3rGvs9nte"
+   },
+   "outputs": [],
+   "source": [
+    "# Example usage\n",
+    "def run_basic_example():\n",
+    "    # Instantiate the QA class with Groq model\n",
+    "    qa = QA(\n",
+    "        model_client=GroqAPIClient(),\n",
+    "        model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
+    "    )\n",
+    "\n",
+    "    # Print the QA instance details\n",
+    "    print(qa)\n",
+    "\n",
+    "    # Test the QA system\n",
+    "    response = qa(\"What is LLM?\")\n",
+    "    print(\"\\nResponse:\")\n",
+    "    print(response)\n",
+    "    print(f\"BasicQAOutput: {response.data}\")\n",
+    "    print(f\"Explanation: {response.data.explanation}\")\n",
+    "    print(f\"Example: {response.data.example}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "QA(\n",
+      "  (generator): Generator(\n",
+      "    model_kwargs={'model': 'llama3-8b-8192'}, trainable_prompt_kwargs=[]\n",
+      "    (prompt): Prompt(\n",
+      "      template: <SYS>\n",
+      "      You are a helpful assistant.\n",
+      "      <OUTPUT_FORMAT>\n",
+      "      {{output_format_str}}\n",
+      "      </OUTPUT_FORMAT>\n",
+      "      </SYS>\n",
+      "      <USER> {{input_str}} </USER>, prompt_kwargs: {'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n    \"explanation\": \"A brief explanation of the concept in one sentence. (str) (required)\",\\n    \"example\": \"An example of the concept in a sentence. (str) (required)\"\\n}\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['input_str', 'output_format_str']\n",
+      "    )\n",
+      "    (model_client): GroqAPIClient()\n",
+      "    (output_processors): DataClassParser(\n",
+      "      data_class=BasicQAOutput, format_type=json,            return_data_class=True, input_fields=[],            output_fields=['explanation', 'example']\n",
+      "      (_output_processor): JsonParser()\n",
+      "      (output_format_prompt): Prompt(\n",
+      "        template: Your output should be formatted as a standard JSON instance with the following schema:\n",
+      "        ```\n",
+      "        {{schema}}\n",
+      "        ```\n",
+      "        -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n",
+      "        -Use double quotes for the keys and string values.\n",
+      "        -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\n",
+      "        -Follow the JSON formatting conventions., prompt_variables: ['schema']\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      ")\n",
+      "\n",
+      "Response:\n",
+      "GeneratorOutput(id=None, data=BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy'), error=None, usage=CompletionUsage(completion_tokens=60, prompt_tokens=174, total_tokens=234), raw_response='```\\n{\\n    \"explanation\": \"Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\",\\n    \"example\": \"The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\"\\n}\\n```', metadata=None)\n",
+      "BasicQAOutput: BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy')\n",
+      "Explanation: Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\n",
+      "Example: The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\n"
+     ]
+    }
+   ],
+   "source": [
+    "run_basic_example()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1n7edLQ19ql8"
+   },
+   "source": [
+    "### Example 1 - Movie analysis data class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "id": "5Arp4-Dq9u49"
+   },
+   "outputs": [],
+   "source": [
+    "# 1. Basic DataClass with different field types\n",
+    "@dataclass\n",
+    "class MovieReview(adal.DataClass):\n",
+    "    title: str = field(\n",
+    "        metadata={\"desc\": \"The title of the movie\"}\n",
+    "    )\n",
+    "    rating: float = field(\n",
+    "        metadata={\n",
+    "            \"desc\": \"Rating from 1.0 to 10.0\",\n",
+    "            \"min\": 1.0,\n",
+    "            \"max\": 10.0\n",
+    "        }\n",
+    "    )\n",
+    "    pros: List[str] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of positive points about the movie\"}\n",
+    "    )\n",
+    "    cons: List[str] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of negative points about the movie\"}\n",
+    "    )\n",
+    "\n",
+    "    __output_fields__ = [\"title\", \"rating\", \"pros\", \"cons\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "id": "VLbRUzXg9yP0"
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "@dataclass\n",
+    "class Actor(adal.DataClass):\n",
+    "    name: str = field(metadata={\"desc\": \"Actor's full name\"})\n",
+    "    role: str = field(metadata={\"desc\": \"Character name in the movie\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "id": "7MUcu0tk91l4"
+   },
+   "outputs": [],
+   "source": [
+    "# 2. Nested DataClass example\n",
+    "\n",
+    "# Have both MovieReview and Actor nested in DetailedMovieReview\n",
+    "\n",
+    "@dataclass\n",
+    "class DetailedMovieReview(adal.DataClass):\n",
+    "    basic_review: MovieReview\n",
+    "    cast: List[Actor] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of main actors in the movie\"}\n",
+    "    )\n",
+    "    genre: List[str] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of genres for the movie\"}\n",
+    "    )\n",
+    "    recommend: bool = field(\n",
+    "        default_factory=str,\n",
+    "        metadata={\"desc\": \"Whether you would recommend this movie\"}\n",
+    "    )\n",
+    "\n",
+    "    __output_fields__ = [\"basic_review\", \"cast\", \"genre\", \"recommend\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example template for movie review\n",
+    "movie_review_template = r\"\"\"<SYS>\n",
+    "You are a professional movie critic. Analyze the given movie and provide a detailed review.\n",
+    "<OUTPUT_FORMAT>\n",
+    "{{output_format_str}}\n",
+    "</OUTPUT_FORMAT>\n",
+    "</SYS>\n",
+    "<USER> Review this movie: {{movie_title}} </USER>\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the MovieReviewer component with MovieAnalysis data class\n",
+    "class MovieReviewer(adal.Component):\n",
+    "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict, data_class: adal.DataClass):\n",
+    "        super().__init__()\n",
+    "        self.additional_structure_prompt = \"Dont use 'type' and 'properties' in output directly give as dict\"\n",
+    "        parser = adal.DataClassParser(\n",
+    "            data_class=data_class,\n",
+    "            return_data_class=True\n",
+    "        )\n",
+    "        self.generator = adal.Generator(\n",
+    "            model_client=model_client,\n",
+    "            model_kwargs=model_kwargs,\n",
+    "            template=movie_review_template,\n",
+    "            prompt_kwargs={\"output_format_str\": parser.get_output_format_str() + self.additional_structure_prompt},\n",
+    "            output_processors=parser,\n",
+    "        )\n",
+    "\n",
+    "    def call(self, movie_title: str):\n",
+    "        return self.generator.call({\"movie_title\": movie_title})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action'], recommend=True)\n",
+      "BasicReview: MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts'])\n",
+      "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# test the data class with one level of nesting\n",
+    "\n",
+    "reviewer = MovieReviewer(\n",
+    "    model_client=GroqAPIClient(),\n",
+    "    model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
+    "    data_class=DetailedMovieReview\n",
+    ")\n",
+    "\n",
+    "response = reviewer(\"The Matrix\")\n",
+    "print(f\"DetailedMovieReview: {response.data}\")\n",
+    "print(f\"BasicReview: {response.data.basic_review}\")\n",
+    "print(f\"Cast: {response.data.cast}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 'Thought-provoking storyline', 'Engaging cyberpunk aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n",
+      "BasicReview: MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 'Thought-provoking storyline', 'Engaging cyberpunk aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards'])\n",
+      "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# try use openai model\n",
+    "reviewer = MovieReviewer(\n",
+    "    model_client=adal.OpenAIClient(),\n",
+    "    model_kwargs={\"model\": \"gpt-4o\"},\n",
+    "    data_class=DetailedMovieReview\n",
+    ")\n",
+    "response = reviewer(\"The Matrix\")\n",
+    "print(f\"DetailedMovieReview: {response.data}\")\n",
+    "print(f\"BasicReview: {response.data.basic_review}\")\n",
+    "print(f\"Cast: {response.data.cast}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We see that both models can handle one level of nested dataclass quite well, and the output ordering follows the ordering specified in `__output_fields__`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "id": "ekr4v8Xg93en"
+   },
+   "outputs": [],
+   "source": [
+    "# 3. second level nested dataclass\n",
+    "\n",
+    "@dataclass\n",
+    "class MovieAnalysis(adal.DataClass):\n",
+    "    review: DetailedMovieReview\n",
+    "    box_office: float = field(\n",
+    "        default=None,\n",
+    "        metadata={\"desc\": \"Box office earnings in millions of dollars\"}\n",
+    "    )\n",
+    "    awards: Dict[str, int] = field(\n",
+    "        default=None,\n",
+    "        metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n",
+    "    )\n",
+    "\n",
+    "    __output_fields__ = [\"review\", \"box_office\", \"awards\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True), box_office=463.5, awards={'Best Visual Effects': 4, 'Best Film Editing': 2, 'Best Sound': 1})\n",
+      "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True)\n",
+      "BasicReview: MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation'])\n",
+      "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# test the data class with two levels of nested dataclass\n",
+    "\n",
+    "# gpt-3.5-turbo model\n",
+    "\n",
+    "analysis = MovieReviewer(\n",
+    "    model_client=adal.OpenAIClient(),\n",
+    "    model_kwargs={\"model\": \"gpt-3.5-turbo\"},\n",
+    "    data_class=MovieAnalysis\n",
+    ")\n",
+    "\n",
+    "response = analysis(\"The Matrix\")\n",
+    "print(f\"MovieAnalysis: {response.data}\")\n",
+    "print(f\"DetailedMovieReview: {response.data.review}\")\n",
+    "print(f\"BasicReview: {response.data.review.basic_review}\")\n",
+    "print(f\"Cast: {response.data.review.cast}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True), box_office=463.5, awards={'Academy Awards': 4, 'MTV Movie Awards': 10, 'Saturn Awards': 7})\n",
+      "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n",
+      "BasicReview: MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts'])\n",
+      "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# test the data class with two levels of nested dataclass\n",
+    "\n",
+    "analysis = MovieReviewer(\n",
+    "    model_client=GroqAPIClient(),\n",
+    "    model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
+    "    data_class=MovieAnalysis\n",
+    ")\n",
+    "\n",
+    "response = analysis(\"The Matrix\")\n",
+    "print(f\"MovieAnalysis: {response.data}\")\n",
+    "print(f\"DetailedMovieReview: {response.data.review}\")\n",
+    "print(f\"BasicReview: {response.data.review.basic_review}\")\n",
+    "print(f\"Cast: {response.data.review.cast}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pSTrf8_t-DCx"
+   },
+   "source": [
+    "### Example 2: Song Review\n",
+    "Note: The Song Review example is adapted from Example 1 (Movie Review), to show how the same DataClass patterns can be reused for a similar task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "id": "7g9bUa0q-B6Y"
+   },
+   "outputs": [],
+   "source": [
+    "# 1. Basic DataClass with different field types\n",
+    "@dataclass\n",
+    "class SongReview(adal.DataClass):\n",
+    "    title: str = field(\n",
+    "        metadata={\"desc\": \"The title of the song\"}\n",
+    "    )\n",
+    "    album: str = field(\n",
+    "        metadata={\"desc\": \"The album of the song\"}\n",
+    "    )\n",
+    "    ranking: int = field(\n",
+    "        metadata={\n",
+    "            \"desc\": \"Billboard peak ranking from 1 to 200\",\n",
+    "            \"min\": 1,\n",
+    "            \"max\": 200\n",
+    "        }\n",
+    "    )\n",
+    "    streaming: Dict[str, int] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"Dict of lastest approximate streaming count in spotify and in youtube. Gives the count in millions\"}\n",
+    "    )\n",
+    "    pros: List[str] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of positive points about the song\"}\n",
+    "    )\n",
+    "    cons: List[str] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of negative points about the song\"}\n",
+    "    )\n",
+    "\n",
+    "    __output_fields__ = [\"title\", \"rating\", \"streaming\", \"pros\", \"cons\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "id": "UGhMRZht-HiB"
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "@dataclass\n",
+    "class Artist(adal.DataClass):\n",
+    "    name: str = field(metadata={\"desc\": \"Artist's full name\"})\n",
+    "    role: str = field(metadata={\"desc\": \"Artist's role in the song\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "id": "sfNWgPYN-JAj"
+   },
+   "outputs": [],
+   "source": [
+    "# 2. Nested DataClass example\n",
+    "\n",
+    "@dataclass\n",
+    "class DetailedSongReview(adal.DataClass):\n",
+    "    basic_review: SongReview = field(\n",
+    "        default=SongReview, metadata={\"desc\": \"basic Song review details\"}\n",
+    "    )\n",
+    "    cast: List[Artist] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of main singer, lyrisist and musicians in the song\"}\n",
+    "    )\n",
+    "    genre: List[str] = field(\n",
+    "        default_factory=list,\n",
+    "        metadata={\"desc\": \"List of genres for the song\"}\n",
+    "    )\n",
+    "    recommend: bool = field(\n",
+    "        default_factory=str,\n",
+    "        metadata={\"desc\": \"Whether you would recommend this song\"}\n",
+    "    )\n",
+    "\n",
+    "    __output_fields__ = [\"basic_review\", \"cast\", \"genre\", \"recommend\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "id": "HG8rtCd8-K7t"
+   },
+   "outputs": [],
+   "source": [
+    "# 3. two levels of nesting dataclass\n",
+    "\n",
+    "# All of these fields have defaults, which makes them optional, so\n",
+    "# the LLM might omit a field if it doesn't have information for it\n",
+    "\n",
+    "@dataclass\n",
+    "class SongAnalysis(adal.DataClass):\n",
+    "    review: DetailedSongReview = field(\n",
+    "        default=DetailedSongReview, metadata={\"desc\": \"Song review details\"}\n",
+    "    )\n",
+    "    duration: float = field(\n",
+    "        default=None,\n",
+    "        metadata={\"desc\": \"Duration of the song\"}\n",
+    "    )\n",
+    "    awards: Dict[str, int] = field(\n",
+    "        default=None,\n",
+    "        metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n",
+    "    )\n",
+    "\n",
+    "    __output_fields__ = [\"review\", \"duration\", \"awards\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {
+    "id": "v3mNeyz7-MpY"
+   },
+   "outputs": [],
+   "source": [
+    "# Example template for song review\n",
+    "song_review_template = r\"\"\"<SYS>\n",
+    "You are a professional song critic. Analyze the given song and provide a detailed review.\n",
+    "<OUTPUT_FORMAT>\n",
+    "{{output_format_str}}\n",
+    "</OUTPUT_FORMAT>\n",
+    "</SYS>\n",
+    "<USER> Review this song: {{song_title}} </USER>\"\"\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {
+    "id": "X2eifXOU-OrE"
+   },
+   "outputs": [],
+   "source": [
+    "# Create the SongReviewer component with SongAnalysis data class\n",
+    "class SongReviewer(adal.Component):\n",
+    "    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n",
+    "        super().__init__()\n",
+    "        self.additional_structure_prompt = \"Dont use 'type' and 'properties' in output directly give as dict\"\n",
+    "        parser = adal.DataClassParser(\n",
+    "            data_class=SongAnalysis,\n",
+    "            return_data_class=False,\n",
+    "            format_type=\"json\"\n",
+    "        )\n",
+    "        self.generator = adal.Generator(\n",
+    "            model_client=model_client,\n",
+    "            model_kwargs=model_kwargs,\n",
+    "            template=song_review_template,\n",
+    "            prompt_kwargs={\"output_format_str\": parser.get_output_format_str() + self.additional_structure_prompt },\n",
+    "            output_processors=parser,\n",
+    "        )\n",
+    "\n",
+    "    def call(self, song_title: str):\n",
+    "        return self.generator.call({\"song_title\": song_title})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SongAnalysis: {'review': {'basic_review': {'title': 'Shape of You', 'album': '÷ (Divide)', 'ranking': 7, 'streaming': {'spotify': 4.5, 'youtube': 2.5}, 'pros': ['Catchy beat', 'Catchy melody', 'Funky rhythm', 'Great lyrics'], 'cons': ['Some may find the lyrics objectifying', 'Not typically my cup of tea']}, 'cast': [{'name': 'Ed Sheeran', 'role': 'Lead vocals, songwriting'}], 'genre': ['Pop', 'Dance', 'Electro'], 'recommend': True}, 'duration': 3.53}\n"
+     ]
+    }
+   ],
+   "source": [
+    "analysis = SongReviewer(\n",
+    "     model_client=GroqAPIClient(),\n",
+    "     model_kwargs={\"model\": \"llama3-8b-8192\"},\n",
+    ")\n",
+    "\n",
+    "response = analysis(\"Shape of you\")\n",
+    "print(f\"SongAnalysis: {response.data}\")\n",
+    "\n",
+    "# This time, because `return_data_class` is set to False in the parser, the output is a plain dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Song Title: Shape of You\n",
+      "Album: ÷ (Divide)\n",
+      "Ranking: 7\n",
+      "- spotify - 4.5 million views\n",
+      "- youtube - 2.5 million views\n",
+      "\n",
+      "Pros:\n",
+      "- Catchy beat\n",
+      "- Catchy melody\n",
+      "- Funky rhythm\n",
+      "- Great lyrics\n",
+      "\n",
+      "Artist's:\n",
+      "- Ed Sheeran as Lead vocals, songwriting\n",
+      "\n",
+      "Genere:  \n",
+      " Pop \n",
+      " Dance \n",
+      " Electro \n",
+      "\n",
+      "Duration: 3.53 minutes\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Access nested data\n",
+    "analysis = response.data\n",
+    "print(f\"Song Title: {analysis['review']['basic_review']['title']}\")\n",
+    "print(f\"Album: {analysis['review']['basic_review']['album']}\")\n",
+    "print(f\"Ranking: {analysis['review']['basic_review']['ranking']}\")\n",
+    "\n",
+    "for platform, views in analysis['review']['basic_review']['streaming'].items():\n",
+    "    print(f\"- {platform} - {views} million views\")\n",
+    "print(\"\\nPros:\")\n",
+    "for pro in analysis['review'][\"basic_review\"][\"pros\"]:\n",
+    "    print(f\"- {pro}\")\n",
+    "\n",
+    "print(\"\\nArtist's:\")\n",
+    "for actor in analysis['review'][\"cast\"]:\n",
+    "        print(f\"- {actor['name']} as {actor['role']}\")\n",
+    "\n",
+    "if analysis['review']['genre']:\n",
+    "    print(\"\\nGenere:  \")\n",
+    "    for genre in analysis['review']['genre']:\n",
+    "        print(f\" {genre} \")\n",
+    "\n",
+    "if analysis['duration']:\n",
+    "    print(f\"\\nDuration: {analysis['duration']} minutes\")\n",
+    "\n",
+    "if hasattr(analysis, 'awards') and analysis['awards']:\n",
+    "    print(\"\\nAwards:\")\n",
+    "    for category, count in analysis['awards'].items():\n",
+    "        print(f\"- {category}: {count}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODOs:\n",
+    "1. Add `JsonOutputParser` and `YamlOutputParser` to this notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "BLAF5qTEmoyW"
+   },
+   "source": [
+    "# Issues and feedback\n",
+    "\n",
+    "If you encounter any issues, please report them here: [GitHub Issues](https://github.com/SylphAI-Inc/LightRAG/issues).\n",
+    "\n",
+    "For feedback, you can use either the [GitHub discussions](https://github.com/SylphAI-Inc/LightRAG/discussions) or [Discord](https://discord.gg/ezzszrRZvT)."
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [
+    "nqe-vxB1BCux",
+    "NGE70aZ8BLuf"
+   ],
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "my-project-kernel",
+   "language": "python",
+   "name": "my-project-kernel"
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/notebooks/tutorials/adalflow_text_splitter.ipynb b/notebooks/tutorials/adalflow_text_splitter.ipynb
index 1426695e..66fb81c7 100644
--- a/notebooks/tutorials/adalflow_text_splitter.ipynb
+++ b/notebooks/tutorials/adalflow_text_splitter.ipynb
@@ -1,171 +1,170 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "A99Pp0T7A9BM"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install adalflow[openai,groq,faiss-cpu]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "y2SVUBNeBMy5"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from getpass import getpass\n",
+    "\n",
+    "# You can use a setup_env file to set the OPENAI_API_KEY too\n",
+    "# (ensure you setup OPENAI_API_KEY in your project .env file) using the following commands:\n",
+    "# from adalflow.utils import setup_env\n",
+    "\n",
+    "# Prompt user to enter their API keys securely\n",
+    "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
+    "\n",
+    "# Set environment variables\n",
+    "os.environ['OPENAI_API_KEY'] = openai_api_key\n",
+    "\n",
+    "print(\"API keys have been set.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
     "colab": {
-      "provenance": []
+     "base_uri": "https://localhost:8080/"
     },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
+    "id": "RWWG9WRt2r9L",
+    "outputId": "faad52a8-47f5-48bc-e2c3-17a5aea21254"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 788.85it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Document(id=6374a3e5-2ef9-40ba-a7b3-e18c2b466390, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
+      "Document(id=b46045ba-3ebb-4e66-93d5-ece2d6ace3de, text='text. Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
+      "Document(id=eba5555b-e6d6-4ca1-8452-af22295e68f8, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
+     ]
     },
-    "language_info": {
-      "name": "python"
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
     }
+   ],
+   "source": [
+    "from adalflow.components.data_process.text_splitter import TextSplitter\n",
+    "from adalflow.core.types import Document\n",
+    "\n",
+    "# Configure the splitter settings\n",
+    "text_splitter = TextSplitter(\n",
+    "    split_by=\"word\",\n",
+    "    chunk_size=5,\n",
+    "    chunk_overlap=1\n",
+    ")\n",
+    "\n",
+    "# Example document\n",
+    "doc = Document(\n",
+    "    text=\"Example text. More example text. Even more text to illustrate.\",\n",
+    "    id=\"doc1\"\n",
+    ")\n",
+    "\n",
+    "# Execute the splitting\n",
+    "splitted_docs = text_splitter.call(documents=[doc])\n",
+    "\n",
+    "for doc in splitted_docs:\n",
+    "    print(doc)"
+   ]
   },
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "A99Pp0T7A9BM"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install adalflow[openai,groq,faiss-cpu]"
-      ]
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "LioyB3eCAOs8",
+    "outputId": "11cddc1c-608a-4027-830f-fe30a882a766"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "source": [
-        "import os\n",
-        "\n",
-        "from getpass import getpass\n",
-        "\n",
-        "# You can use a setup_env file to set the OPENAI_API_KEY too\n",
-        "# (ensure you setup OPENAI_API_KEY in your project .env file) using the following commands:\n",
-        "# from adalflow.utils import setup_env\n",
-        "\n",
-        "# Prompt user to enter their API keys securely\n",
-        "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
-        "\n",
-        "# Set environment variables\n",
-        "os.environ['OPENAI_API_KEY'] = openai_api_key\n",
-        "\n",
-        "print(\"API keys have been set.\")"
-      ],
-      "metadata": {
-        "id": "y2SVUBNeBMy5"
-      },
-      "execution_count": null,
-      "outputs": []
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 489.02it/s]"
+     ]
     },
     {
-      "cell_type": "code",
-      "source": [
-        "from adalflow.components.data_process.text_splitter import TextSplitter\n",
-        "from adalflow.core.types import Document\n",
-        "\n",
-        "# Configure the splitter settings\n",
-        "text_splitter = TextSplitter(\n",
-        "    split_by=\"word\",\n",
-        "    chunk_size=5,\n",
-        "    chunk_overlap=1\n",
-        ")\n",
-        "\n",
-        "# Example document\n",
-        "doc = Document(\n",
-        "    text=\"Example text. More example text. Even more text to illustrate.\",\n",
-        "    id=\"doc1\"\n",
-        ")\n",
-        "\n",
-        "# Execute the splitting\n",
-        "splitted_docs = text_splitter.call(documents=[doc])\n",
-        "\n",
-        "for doc in splitted_docs:\n",
-        "    print(doc)"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "RWWG9WRt2r9L",
-        "outputId": "faad52a8-47f5-48bc-e2c3-17a5aea21254"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 788.85it/s]"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Document(id=6374a3e5-2ef9-40ba-a7b3-e18c2b466390, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
-            "Document(id=b46045ba-3ebb-4e66-93d5-ece2d6ace3de, text='text. Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
-            "Document(id=eba5555b-e6d6-4ca1-8452-af22295e68f8, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "\n"
-          ]
-        }
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Document(id=b0c308f2-73d2-44cf-aaf2-63e8f87198e4, text='Example text. More example', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
+      "Document(id=3a37adff-c8ac-4cff-8b5e-9c68e0de9772, text=' text. Even more text', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
+      "Document(id=e1b56768-7918-4a94-8f08-a01161cb2dcf, text=' to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "source": [
-        "from adalflow.components.data_process.text_splitter import TextSplitter\n",
-        "from adalflow.core.types import Document\n",
-        "import tiktoken\n",
-        "\n",
-        "# Configure the splitter settings\n",
-        "text_splitter = TextSplitter(\n",
-        "    split_by=\"token\",\n",
-        "    chunk_size=5,\n",
-        "    chunk_overlap=0\n",
-        ")\n",
-        "\n",
-        "doc = Document(\n",
-        "    text=\"Example text. More example text. Even more text to illustrate.\",\n",
-        "    id = \"doc1\"\n",
-        "    )\n",
-        "\n",
-        "splitted_docs = (text_splitter.call(documents=[doc]))\n",
-        "\n",
-        "for doc in splitted_docs:\n",
-        "    print(doc)"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "LioyB3eCAOs8",
-        "outputId": "11cddc1c-608a-4027-830f-fe30a882a766"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 489.02it/s]"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Document(id=b0c308f2-73d2-44cf-aaf2-63e8f87198e4, text='Example text. More example', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
-            "Document(id=3a37adff-c8ac-4cff-8b5e-9c68e0de9772, text=' text. Even more text', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
-            "Document(id=e1b56768-7918-4a94-8f08-a01161cb2dcf, text=' to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "\n"
-          ]
-        }
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
     }
-  ]
+   ],
+   "source": [
+    "from adalflow.components.data_process.text_splitter import TextSplitter\n",
+    "from adalflow.core.types import Document\n",
+    "\n",
+    "# Configure the splitter settings\n",
+    "text_splitter = TextSplitter(\n",
+    "    split_by=\"token\",\n",
+    "    chunk_size=5,\n",
+    "    chunk_overlap=0\n",
+    ")\n",
+    "\n",
+    "doc = Document(\n",
+    "    text=\"Example text. More example text. Even more text to illustrate.\",\n",
+    "    id = \"doc1\"\n",
+    "    )\n",
+    "\n",
+    "splitted_docs = (text_splitter.call(documents=[doc]))\n",
+    "\n",
+    "for doc in splitted_docs:\n",
+    "    print(doc)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/poetry.lock b/poetry.lock
index 92749681..e4788fc3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -470,6 +470,50 @@ charset-normalizer = ["charset-normalizer"]
 html5lib = ["html5lib"]
 lxml = ["lxml"]
 
+[[package]]
+name = "black"
+version = "24.10.0"
+description = "The uncompromising code formatter."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"},
+    {file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"},
+    {file = "black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:649fff99a20bd06c6f727d2a27f401331dc0cc861fb69cde910fe95b01b5928f"},
+    {file = "black-24.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe4d6476887de70546212c99ac9bd803d90b42fc4767f058a0baa895013fbb3e"},
+    {file = "black-24.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5a2221696a8224e335c28816a9d331a6c2ae15a2ee34ec857dcf3e45dbfa99ad"},
+    {file = "black-24.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9da3333530dbcecc1be13e69c250ed8dfa67f43c4005fb537bb426e19200d50"},
+    {file = "black-24.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4007b1393d902b48b36958a216c20c4482f601569d19ed1df294a496eb366392"},
+    {file = "black-24.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:394d4ddc64782e51153eadcaaca95144ac4c35e27ef9b0a42e121ae7e57a9175"},
+    {file = "black-24.10.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e39e0fae001df40f95bd8cc36b9165c5e2ea88900167bddf258bacef9bbdc3"},
+    {file = "black-24.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d37d422772111794b26757c5b55a3eade028aa3fde43121ab7b673d050949d65"},
+    {file = "black-24.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14b3502784f09ce2443830e3133dacf2c0110d45191ed470ecb04d0f5f6fcb0f"},
+    {file = "black-24.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:30d2c30dc5139211dda799758559d1b049f7f14c580c409d6ad925b74a4208a8"},
+    {file = "black-24.10.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1cbacacb19e922a1d75ef2b6ccaefcd6e93a2c05ede32f06a21386a04cedb981"},
+    {file = "black-24.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1f93102e0c5bb3907451063e08b9876dbeac810e7da5a8bfb7aeb5a9ef89066b"},
+    {file = "black-24.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ddacb691cdcdf77b96f549cf9591701d8db36b2f19519373d60d31746068dbf2"},
+    {file = "black-24.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:680359d932801c76d2e9c9068d05c6b107f2584b2a5b88831c83962eb9984c1b"},
+    {file = "black-24.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:17374989640fbca88b6a448129cd1745c5eb8d9547b464f281b251dd00155ccd"},
+    {file = "black-24.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:63f626344343083322233f175aaf372d326de8436f5928c042639a4afbbf1d3f"},
+    {file = "black-24.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfa1d0cb6200857f1923b602f978386a3a2758a65b52e0950299ea014be6800"},
+    {file = "black-24.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cd9c95431d94adc56600710f8813ee27eea544dd118d45896bb734e9d7a0dc7"},
+    {file = "black-24.10.0-py3-none-any.whl", hash = "sha256:3bb2b7a1f7b685f85b11fed1ef10f8a9148bceb49853e47a294a3dd963c1dd7d"},
+    {file = "black-24.10.0.tar.gz", hash = "sha256:846ea64c97afe3bc677b761787993be4991810ecc7a4a937816dd6bddedc4875"},
+]
+
+[package.dependencies]
+click = ">=8.0.0"
+mypy-extensions = ">=0.4.3"
+packaging = ">=22.0"
+pathspec = ">=0.9.0"
+platformdirs = ">=2"
+
+[package.extras]
+colorama = ["colorama (>=0.4.3)"]
+d = ["aiohttp (>=3.10)"]
+jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
+uvloop = ["uvloop (>=0.15.2)"]
+
 [[package]]
 name = "bleach"
 version = "6.1.0"
@@ -3601,6 +3645,17 @@ files = [
 qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"]
 testing = ["docopt", "pytest"]
 
+[[package]]
+name = "pathspec"
+version = "0.12.1"
+description = "Utility library for gitignore style pattern matching of file paths."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
+    {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
+]
+
 [[package]]
 name = "pexpect"
 version = "4.9.0"
@@ -4744,6 +4799,33 @@ files = [
     {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"},
 ]
 
+[[package]]
+name = "ruff"
+version = "0.8.0"
+description = "An extremely fast Python linter and code formatter, written in Rust."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "ruff-0.8.0-py3-none-linux_armv6l.whl", hash = "sha256:fcb1bf2cc6706adae9d79c8d86478677e3bbd4ced796ccad106fd4776d395fea"},
+    {file = "ruff-0.8.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:295bb4c02d58ff2ef4378a1870c20af30723013f441c9d1637a008baaf928c8b"},
+    {file = "ruff-0.8.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7b1f1c76b47c18fa92ee78b60d2d20d7e866c55ee603e7d19c1e991fad933a9a"},
+    {file = "ruff-0.8.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb0d4f250a7711b67ad513fde67e8870109e5ce590a801c3722580fe98c33a99"},
+    {file = "ruff-0.8.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e55cce9aa93c5d0d4e3937e47b169035c7e91c8655b0974e61bb79cf398d49c"},
+    {file = "ruff-0.8.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f4cd64916d8e732ce6b87f3f5296a8942d285bbbc161acee7fe561134af64f9"},
+    {file = "ruff-0.8.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c5c1466be2a2ebdf7c5450dd5d980cc87c8ba6976fb82582fea18823da6fa362"},
+    {file = "ruff-0.8.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2dabfd05b96b7b8f2da00d53c514eea842bff83e41e1cceb08ae1966254a51df"},
+    {file = "ruff-0.8.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:facebdfe5a5af6b1588a1d26d170635ead6892d0e314477e80256ef4a8470cf3"},
+    {file = "ruff-0.8.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87a8e86bae0dbd749c815211ca11e3a7bd559b9710746c559ed63106d382bd9c"},
+    {file = "ruff-0.8.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:85e654f0ded7befe2d61eeaf3d3b1e4ef3894469cd664ffa85006c7720f1e4a2"},
+    {file = "ruff-0.8.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:83a55679c4cb449fa527b8497cadf54f076603cc36779b2170b24f704171ce70"},
+    {file = "ruff-0.8.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:812e2052121634cf13cd6fddf0c1871d0ead1aad40a1a258753c04c18bb71bbd"},
+    {file = "ruff-0.8.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:780d5d8523c04202184405e60c98d7595bdb498c3c6abba3b6d4cdf2ca2af426"},
+    {file = "ruff-0.8.0-py3-none-win32.whl", hash = "sha256:5fdb6efecc3eb60bba5819679466471fd7d13c53487df7248d6e27146e985468"},
+    {file = "ruff-0.8.0-py3-none-win_amd64.whl", hash = "sha256:582891c57b96228d146725975fbb942e1f30a0c4ba19722e692ca3eb25cc9b4f"},
+    {file = "ruff-0.8.0-py3-none-win_arm64.whl", hash = "sha256:ba93e6294e9a737cd726b74b09a6972e36bb511f9a102f1d9a7e1ce94dd206a6"},
+    {file = "ruff-0.8.0.tar.gz", hash = "sha256:a7ccfe6331bf8c8dad715753e157457faf7351c2b69f62f32c165c2dbcbacd44"},
+]
+
 [[package]]
 name = "s3transfer"
 version = "0.10.2"
@@ -6130,4 +6212,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11, <4.0"
-content-hash = "e37147771ba46212166b62327cb83b76012da7ddbcc57aa5088ff7418eef2393"
+content-hash = "6f206712012373417ade22f41508b5111eb14b3520a14ac76a125d51718e3a54"
diff --git a/pyproject.toml b/pyproject.toml
index 9288a31c..abdd799c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,10 +17,7 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.11, <4.0"
 adalflow = { path = "adalflow", develop = true }
-# torch = "^2.3.1"
 openai = "^1.34.0"
-# lightrag = {path = "lightrag/dist/lightrag-0.0.0a11-py3-none-any.whl"}
-# lightrag = "^0.0.0a13"
 
 
 [tool.poetry.group.dev.dependencies]
@@ -51,8 +48,24 @@ faiss-cpu = "^1.8.0.post1"
 nltk = "^3.9.1"
 ragas = "^0.1.16"
 colorama = "^0.4.6"
+black = "^24.10.0"
+ruff = "^0.8.0"
 
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
+
+
+# for formatting and linting
+[tool.black]
+line-length = 88
+target-version = ["py311"]
+
+[tool.ruff]
+lint.extend-ignore = [
+    "E402",  # Ignore module-level import issues
+    "E731",
+    "UP007", # Wants | over Union, which breaks 3.8
+]
+line-length = 88
diff --git a/tutorials/component.ipynb b/tutorials/component.ipynb
index 17e371a4..ccfe51e1 100644
--- a/tutorials/component.ipynb
+++ b/tutorials/component.ipynb
@@ -6,11 +6,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import re\n",
     "from adalflow.core import Component, Generator\n",
-    "from adalflow.components.model_client import OpenAIClient\n",
-    "from adalflow.components.model_client import GroqAPIClient\n",
-    "from adalflow.utils import setup_env # make sure you have a .env file with OPENAI_API_KEY and GROQ_API_KEY"
+    "from adalflow.components.model_client import OpenAIClient"
    ]
   },
   {
@@ -74,10 +71,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "doc = DocQA()"
+   ]
   },
   {
    "cell_type": "code",
@@ -235,7 +234,7 @@
     "save_pickle(states, \"doc.pkl\")\n",
     "\n",
     "# load the serialized states from a file\n",
-    "from adalflow.utils.file_io import load_pickle, load_json\n",
+    "from adalflow.utils.file_io import load_pickle\n",
     "states_loaded = load_pickle(\"doc.pkl\")\n",
     "# states_loaded = load_json(\"doc.json\")\n",
     "\n",
@@ -463,7 +462,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from adalflow.utils.file_io import save_json\n",
     "\n",
     "save_json(doc.to_dict(), \"doc.json\")"
    ]
@@ -624,7 +622,7 @@
     }
    ],
    "source": [
-    "from adalflow.core.component import Sequential\n",
+    "from adalflow import Sequential\n",
     "\n",
     "@fun_to_component\n",
     "def enhance_query(query:str) -> str:\n",
diff --git a/tutorials/dataclass.ipynb b/tutorials/dataclass.ipynb
index 1b8cc519..e2631c2b 100644
--- a/tutorials/dataclass.ipynb
+++ b/tutorials/dataclass.ipynb
@@ -57,7 +57,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -77,18 +77,18 @@
     }
    ],
    "source": [
-    "# it does not allow required field after optional field\n",
-    "@dataclass\n",
-    "class TrecData2:\n",
-    "    question: Question = field(\n",
-    "        metadata={\"desc\": \"The question asked by the user\"}\n",
-    "    ) # Required field, you have to provide the question field at the instantiation\n",
-    "    label: int = field(\n",
-    "        metadata={\"desc\": \"The label of the question\"}, default=0\n",
-    "    ) # Optional field\n",
-    "    metadata: dict = field(\n",
-    "        metadata={\"desc\": \"The metadata of the question\"}\n",
-    "    ) # required field"
+    "# # it does not allow required field after optional field\n",
+    "# @dataclass\n",
+    "# class TrecData2:\n",
+    "#     question: Question = field(\n",
+    "#         metadata={\"desc\": \"The question asked by the user\"}\n",
+    "#     ) # Required field, you have to provide the question field at the instantiation\n",
+    "#     label: int = field(\n",
+    "#         metadata={\"desc\": \"The label of the question\"}, default=0\n",
+    "#     ) # Optional field\n",
+    "#     metadata: dict = field(\n",
+    "#         metadata={\"desc\": \"The metadata of the question\"}\n",
+    "#     ) # required field"
    ]
   },
   {
diff --git a/tutorials/embedder.ipynb b/tutorials/embedder.ipynb
index b7a5c714..29625454 100644
--- a/tutorials/embedder.ipynb
+++ b/tutorials/embedder.ipynb
@@ -15,7 +15,6 @@
    "source": [
     "from adalflow.core.embedder import Embedder\n",
     "from adalflow.components.model_client import OpenAIClient\n",
-    "from adalflow.utils import setup_env # ensure you setup OPENAI_API_KEY in your project .env file\n",
     "\n",
     "model_kwargs = {\n",
     "    \"model\": \"text-embedding-3-small\",\n",
diff --git a/tutorials/generator.ipynb b/tutorials/generator.ipynb
index 1bf47865..e8a3fac2 100644
--- a/tutorials/generator.ipynb
+++ b/tutorials/generator.ipynb
@@ -44,7 +44,7 @@
    ],
    "source": [
     "from adalflow.core import Generator\n",
-    "from adalflow.components.model_client import OpenAIClient, get_all_messages_content, get_probabilities\n",
+    "from adalflow.components.model_client import OpenAIClient, get_probabilities\n",
     "from adalflow.utils import enable_library_logging\n",
     "\n",
     "enable_library_logging(level=\"DEBUG\")\n",
@@ -78,7 +78,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from adalflow.core import Component, Generator, Prompt\n",
+    "from adalflow.core import Component, Generator\n",
     "from adalflow.components.model_client import GroqAPIClient\n",
     "from adalflow.utils import setup_env # noqa\n",
     "\n",
diff --git a/tutorials/model_client.ipynb b/tutorials/model_client.ipynb
index 3228d8c1..3e5b7b06 100644
--- a/tutorials/model_client.ipynb
+++ b/tutorials/model_client.ipynb
@@ -26,7 +26,6 @@
    "source": [
     "from adalflow.components.model_client import OpenAIClient\n",
     "from adalflow.core.types import ModelType\n",
-    "from adalflow.utils import setup_env\n",
     "\n",
     "openai_client = OpenAIClient()\n",
     "\n",
diff --git a/tutorials/react_note.ipynb b/tutorials/react_note.ipynb
index 2d0f2be0..0b647a4b 100644
--- a/tutorials/react_note.ipynb
+++ b/tutorials/react_note.ipynb
@@ -8,7 +8,7 @@
    "source": [
     "from adalflow.components.agent import ReActAgent\n",
     "from adalflow.core import Generator, ModelClientType, ModelClient\n",
-    "from adalflow.utils import setup_env, get_logger\n",
+    "from adalflow.utils import setup_env\n",
     "\n",
     "# get_logger(level=\"DEBUG\")\n",
     "\n",
diff --git a/tutorials/retriever.ipynb b/tutorials/retriever.ipynb
index 413dc465..c464f46b 100644
--- a/tutorials/retriever.ipynb
+++ b/tutorials/retriever.ipynb
@@ -536,7 +536,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -551,7 +551,7 @@
    ],
    "source": [
     "# try to use title this time\n",
-    "document_map_func = lambda x: x[\"title\"] + \" \" + x[\"content\"]\n",
+    "document_map_func = lambda x: x[\"title\"] + \" \" + x[\"content\"] # noqa: E731\n",
     "\n",
     "reranker.build_index_from_documents(documents=documents, document_map_func=document_map_func)\n",
     "\n",
@@ -1300,7 +1300,6 @@
     "\n",
     "from adalflow.tracing import trace_generator_call\n",
     "\n",
-    "from adalflow.utils import setup_env\n",
     "\n",
     "# 1. set up the tracing for failed call as the retriever has generator attribute\n",
     "\n",
diff --git a/tutorials/tools.ipynb b/tutorials/tools.ipynb
index 3490fe7d..c32b9420 100644
--- a/tutorials/tools.ipynb
+++ b/tutorials/tools.ipynb
@@ -17,7 +17,6 @@
    "source": [
     "from openai import OpenAI\n",
     "import json\n",
-    "from adalflow.utils import setup_env\n",
     "\n",
     "client = OpenAI()\n",
     "\n",
@@ -106,7 +105,7 @@
    "outputs": [],
    "source": [
     "from dataclasses import dataclass\n",
-    "from typing import Any, Dict, List, Tuple\n",
+    "from typing import Any, Dict, List\n",
     "import numpy as np\n",
     "import time\n",
     "import asyncio\n",
@@ -445,12 +444,10 @@
    "source": [
     "# call all the above functions \n",
     "import nest_asyncio\n",
-    "import asyncio\n",
     "\n",
     "nest_asyncio.apply()\n",
     "\n",
     "\n",
-    "import time\n",
     "\n",
     "async def async_function_1():\n",
     "    await asyncio.sleep(1)\n",
@@ -1290,13 +1287,7 @@
     }
    ],
    "source": [
-    "import ast\n",
-    "import builtins\n",
-    "import contextlib\n",
-    "import ctypes\n",
-    "import sys\n",
     "import threading\n",
-    "import time\n",
     "\n",
     "# Define a list of safe built-ins\n",
     "SAFE_BUILTINS = {\n",
@@ -1787,7 +1778,6 @@
    "source": [
     "queries = [\"add 2 and 3\", \"search for something\", \"add points (1, 2) and (3, 4)\", \"sum numpy array with arr = np.array([[1, 2], [3, 4]])\", \"multiply 2 with local variable x\", \"divide 2 by 3\"]\n",
     "\n",
-    "from adalflow.components.output_parsers import ListOutputParser\n",
     "from adalflow.core.string_parser import JsonParser # improve a list of json\n",
     "\n",
     "preset_prompt_kwargs = {\n",
@@ -1982,9 +1972,6 @@
     "# first check the openai's function call apis\n",
     "\n",
     "from openai import OpenAI\n",
-    "from openai.types import FunctionDefinition\n",
-    "from adalflow.utils import setup_env\n",
-    "import json\n",
     "\n",
     "client = OpenAI()\n",
     "\n",
@@ -2242,8 +2229,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "adalflow_fn_schema =\n",
-    "{\n",
+    "adalflow_fn_schema ={\n",
     "        \"type\": \"object\",\n",
     "        \"properties\": {\n",
     "            \"weather\": {\n",
@@ -2284,31 +2270,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "    llama_fn_schema = {\n",
-    "        \"type\": \"object\",\n",
-    "        \"properties\": {\"weather\": {\"$ref\": \"#/definitions/Weather\"}},\n",
-    "        \"required\": [\"weather\"],\n",
-    "        \"definitions\": {\n",
-    "            \"Weather\": {\n",
-    "                \"title\": \"Weather\",\n",
-    "                \"type\": \"object\",\n",
-    "                \"properties\": {\n",
-    "                    \"location\": {\n",
-    "                        \"title\": \"Location\",\n",
-    "                        \"desc\": \"The city and state, e.g. San Francisco, CA\",\n",
-    "                        \"type\": \"string\",\n",
-    "                    },\n",
-    "                    \"unit\": {\n",
-    "                        \"title\": \"Unit\",\n",
-    "                        \"enum\": [\"celsius\", \"fahrenheit\"],\n",
-    "                        \"type\": \"string\",\n",
-    "                    },\n",
+    "llama_fn_schema = {\n",
+    "    \"type\": \"object\",\n",
+    "    \"properties\": {\"weather\": {\"$ref\": \"#/definitions/Weather\"}},\n",
+    "    \"required\": [\"weather\"],\n",
+    "    \"definitions\": {\n",
+    "        \"Weather\": {\n",
+    "            \"title\": \"Weather\",\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"location\": {\n",
+    "                    \"title\": \"Location\",\n",
+    "                    \"desc\": \"The city and state, e.g. San Francisco, CA\",\n",
+    "                    \"type\": \"string\",\n",
     "                },\n",
-    "                \"required\": [\"location\", \"unit\"],\n",
-    "                \"additionalProperties\": false,\n",
-    "            }\n",
-    "        },\n",
-    "    }"
+    "                \"unit\": {\n",
+    "                    \"title\": \"Unit\",\n",
+    "                    \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+    "                    \"type\": \"string\",\n",
+    "                },\n",
+    "            },\n",
+    "            \"required\": [\"location\", \"unit\"],\n",
+    "            \"additionalProperties\": False,\n",
+    "        }\n",
+    "    },\n",
+    "}"
    ]
   },
   {
@@ -2319,7 +2305,6 @@
    "source": [
     "# level 1, call function with default python data types\n",
     "# such as str, int, float, list, dict, etc.\n",
-    "\n",
     "def _get_current_weather(location: str, unit: str = \"fahrenheit\"):\n",
     "    \"\"\"Get the current weather in a given location\"\"\"\n",
     "    if \"tokyo\" in location.lower():\n",
diff --git a/use_cases/classification/train.py b/use_cases/classification/train.py
index f287c164..0bdbd562 100644
--- a/use_cases/classification/train.py
+++ b/use_cases/classification/train.py
@@ -126,7 +126,7 @@ def train(
         debug=False,
         max_steps=12,
         strategy="constrained",
-        optimization_order="sequential"
+        optimization_order="sequential",
     )
     # val 0.694 -> 0.833, #test 0.8472 -> 0.833, adding more shots does not help
     # NOTE: raw: 40, bootstrap: 4, max_steps: 8, strategy: random, val: 86.1, test: 86.8 (+4.2% compared with dspy)
diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py
index c4c64fbc..280f7c1a 100644
--- a/use_cases/question_answering/bbh/object_count/train_new.py
+++ b/use_cases/question_answering/bbh/object_count/train_new.py
@@ -111,7 +111,7 @@ def train(
         **gpt_3_model,
         teacher_model_config=gpt_4o_model,
         text_optimizer_model_config=gpt_4o_model,
-        backward_engine_model_config=gpt_4o_model
+        backward_engine_model_config=gpt_4o_model,
     )
     print(adal_component)
     trainer = adal.Trainer(