From d5dc0ae3b72c9779b3e06fe62b984f3cee1e4a2a Mon Sep 17 00:00:00 2001 From: Vinay Raman Date: Tue, 12 Nov 2024 14:31:48 -0800 Subject: [PATCH] added table of contents to notebook, tested main.py Signed-off-by: Vinay Raman --- .../__init__.py | 0 .../config/__init__.py | 0 .../main.py | 32 +- .../notebooks/quickstart.ipynb | 405 +++++++++--------- .../retriever_evalset_generator.py | 14 +- 5 files changed, 240 insertions(+), 211 deletions(-) create mode 100644 tutorials/nemo-retriever-synthetic-data-generation/__init__.py create mode 100644 tutorials/nemo-retriever-synthetic-data-generation/config/__init__.py diff --git a/tutorials/nemo-retriever-synthetic-data-generation/__init__.py b/tutorials/nemo-retriever-synthetic-data-generation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tutorials/nemo-retriever-synthetic-data-generation/config/__init__.py b/tutorials/nemo-retriever-synthetic-data-generation/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tutorials/nemo-retriever-synthetic-data-generation/main.py b/tutorials/nemo-retriever-synthetic-data-generation/main.py index eb27b024d..a385760c8 100644 --- a/tutorials/nemo-retriever-synthetic-data-generation/main.py +++ b/tutorials/nemo-retriever-synthetic-data-generation/main.py @@ -13,36 +13,49 @@ # limitations under the License. import argparse +import importlib import os import shutil from typing import Any, List -from retriever_evalset_generator import RetrieverEvalSetGenerator from tqdm.dask import TqdmCallback from nemo_curator import AsyncOpenAIClient, ScoreFilter, Sequential from nemo_curator.datasets import DocumentDataset from nemo_curator.filters import AnswerabilityFilter, EasinessFilter -from nemo_curator.modules.config import RetrieverEvalSDGConfig from nemo_curator.modules.filter import Score, ScoreFilter +config = importlib.import_module( + "tutorials.nemo-retriever-synthetic-data-generation.config.config" +) +retriever_evalset_generator = importlib.import_module( + "tutorials.nemo-retriever-synthetic-data-generation.retriever_evalset_generator" +) + def get_pipeline(args: Any) -> Any: - cfg = RetrieverEvalSDGConfig.from_yaml(args.pipeline_config) + cfg = config.RetrieverEvalSDGConfig.from_yaml(args.pipeline_config) # update api_key from input args cfg.api_key = args.api_key sdg_pipeline = Sequential( [ - RetrieverEvalSetGenerator(cfg), + retriever_evalset_generator.RetrieverEvalSetGenerator(cfg), ] ) filters = [] if cfg.easiness_filter: filters.append( ScoreFilter( - EasinessFilter(cfg), + EasinessFilter( + cfg.base_url, + cfg.api_key, + cfg.easiness_filter, + cfg.percentile, + cfg.truncate, + cfg.batch_size, + ), text_field=["text", "question"], score_field="easiness_scores", ) @@ -50,7 +63,14 @@ def get_pipeline(args: Any) -> Any: if cfg.answerability_filter: filters.append( ScoreFilter( - AnswerabilityFilter(cfg), + AnswerabilityFilter( + cfg.base_url, + cfg.api_key, + cfg.answerability_filter, + cfg.answerability_system_prompt, + cfg.answerability_user_prompt_template, + cfg.num_criteria, + ), text_field=["text", "question"], score_field="answerability_scores", ) diff --git a/tutorials/nemo-retriever-synthetic-data-generation/notebooks/quickstart.ipynb b/tutorials/nemo-retriever-synthetic-data-generation/notebooks/quickstart.ipynb index b8eb1254f..db60388e0 100644 --- a/tutorials/nemo-retriever-synthetic-data-generation/notebooks/quickstart.ipynb +++ b/tutorials/nemo-retriever-synthetic-data-generation/notebooks/quickstart.ipynb @@ -6,18 +6,33 @@ "source": [ "# Synthetic Evaluation Data Generation\n", "\n", - "\n", - "## Quickstart\n", - "\n", + "## Table of Contents\n", + "1. [Install required libraries](#Install-required-libraries)\n", + "2. [Prepare input data](#Prepare-input-data)\n", + "3. [Generate API key](#Generating-API-key)\n", + "4. [Loading dataset](#Loading-datasets)\n", + "5. [Reading pipeline config](#Read-pipeline-config)\n", + "6. [Data Generation](#Running-the-Synthetic-Data-Generator)\n", + "7. [Data Quality Assessment](#Data-Quality-Assessment)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "### Install required libraries\n", "\n", "```\n", "$ pip install -r requirements.txt\n", "```\n", "\n", - "Please also see [README.md](../README.md) for environment setup including necessary library installation.\n", - "\n", - "\n", + "Please also see [README.md](../README.md) for environment setup including necessary library installation.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "### Prepare input data\n", "\n", "The synthetic data generation framework supports two input formats `rawdoc` or `squad`. \n", @@ -37,62 +52,33 @@ "{\"_id\": \"doc3\", \"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n", "...\n", "```\n", - "This repository contains a sample JSONL file `data/sample_data.jsonl`.\n", - "\n", - "\n", - "- `input_format=squad`\n", - "\n", - "If you have manually created questions and would like to conduct further analysis (correlation between synthetic questions and original questions), the input data should follow the SQuAD format.\n", - "\n", - "```\n", - " {\n", - " \"data\": [\n", - " {\n", - " \"paragraphs\": [\n", - " {\n", - " \"context\": \"The quick brown fox jumps over the lazy dog.\",\n", - " \"qas\": [\n", - " {\n", - " \"question\": \"What does the fox jump over?\",\n", - " \"id\": \"q1\",\n", - " \"synthetic\": true,\n", - " \"answers\": [\n", - " {\n", - " \"text\": \"The fox jump over the lazy dog\",\n", - " \"answer_start\": -1, # For generative answers\n", - " \"synthetic\": true,\n", - " }\n", - " ]\n", - " }\n", - " ]\n", - " }\n", - " ],\n", - " \"title\": \"Example\"\n", - " }\n", - " ],\n", - " \"version\": \"2.0\"\n", - " } \n", - "```" + "This repository contains a sample JSONL file `data/sample_data.jsonl`." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "from omegaconf import OmegaConf\n", "import sys\n", - "sys.path.append(\"../\")\n", - "sys.path.append(\"../../../\")\n", - "from retriever_evalset_generator import RetrieverEvalSetGenerator\n", - "from filters import EasinessFilter, AnswerabilityFilter\n", + "import importlib\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "\n", + "from nemo_curator.filters.synthetic import EasinessFilter, AnswerabilityFilter\n", "from nemo_curator.modules.filter import ScoreFilter, Score\n", "from nemo_curator.datasets import DocumentDataset\n", - "from nemo_curator.modules.config import RetrieverEvalSDGConfig\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n" + "\n", + "config = importlib.import_module(\n", + " \"tutorials.nemo-retriever-synthetic-data-generation.config.config\"\n", + ")\n", + "retriever_evalset_generator = importlib.import_module(\n", + " \"tutorials.nemo-retriever-synthetic-data-generation.retriever_evalset_generator\"\n", + ")" ] }, { @@ -118,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -128,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -202,7 +188,7 @@ "4 The Colosseum - Ancient Roman Architecture " ] }, - "execution_count": 17, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -215,7 +201,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Read pipeline config file, instantiate Generator Object" + "### Read pipeline config" ] }, { @@ -224,9 +210,26 @@ "metadata": {}, "outputs": [], "source": [ - "cfg = RetrieverEvalSDGConfig.from_yaml(\"../config/config.yaml\")\n", + "cfg = config.RetrieverEvalSDGConfig.from_yaml(\"../config/config.yaml\")\n", "cfg.api_key = \"your api key here\"\n", - "retrieval_evalset_generator = RetrieverEvalSetGenerator(cfg)" + "retrieval_evalset_generator = retriever_evalset_generator.RetrieverEvalSetGenerator(cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generator model used = mistralai/mixtral-8x22b-instruct-v0.1\n" + ] + } + ], + "source": [ + "print (f\"Generator model used = {cfg.generator_model}\")" ] }, { @@ -239,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -258,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -282,11 +285,11 @@ " \n", " \n", " \n", - " _id\n", " text\n", " title\n", - " question-id\n", " question\n", + " _id\n", + " question-id\n", " answer\n", " score\n", " \n", @@ -294,52 +297,52 @@ " \n", " \n", " 0\n", - " f95b082088c0bec2d0dc98a3799f31b3ebc050517cbfd3...\n", " The Eiffel Tower is an iconic landmark of Pari...\n", " Eiffel Tower - A French Icon\n", - " ff9de122e00206eebb733e02e7d38ba50e9e9c640e5128...\n", " What is the significance of the Eiffel Tower i...\n", + " 342d2d470596528b192b9f0a12d0ec5f4798ab1fc84090...\n", + " c6075864cc0c9318df5456c2b06bfb581562542205ff99...\n", " The Eiffel Tower is an iconic landmark in Pari...\n", " 1\n", " \n", " \n", " 1\n", - " a8b7a30bebd2693a4f797455bb06d720613f72a11d3aaf...\n", " The Eiffel Tower is an iconic landmark of Pari...\n", " Eiffel Tower - A French Icon\n", - " d1cff3a0aaefc5ee61462131ac196f81b725f41458d3b6...\n", " Who was responsible for designing the Eiffel T...\n", + " 12dcafeb731d5ef4e1903f1e6cc35bfa9d5e40f740e967...\n", + " 003de77e8d7a0d499d75edfc5ad4633d4a2703b89c1f09...\n", " The Eiffel Tower was designed by the engineer ...\n", " 1\n", " \n", " \n", " 2\n", - " 24f82d3e8cc1227c8b4b50f0912192582724a754b48601...\n", " The Eiffel Tower is an iconic landmark of Pari...\n", " Eiffel Tower - A French Icon\n", - " 651eb12e56f0f40473acd24abb4199760ae746b16bdd48...\n", - " What was the occasion for building the Eiffel ...\n", - " The Eiffel Tower was built for the 1889 Exposi...\n", + " When was the Eiffel Tower built and for what p...\n", + " e5d22c48da4684bf5da4afe414d2d6630709e5b134b847...\n", + " eb5bfbf35e7d53cc2affc58146721a017c72c38344ca1d...\n", + " The Eiffel Tower was built in 1889 for the Exp...\n", " 1\n", " \n", " \n", " 3\n", - " b4c53eb52d1463892cc90f45be86de7e2bc8b34d349e21...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " ef457b9db10d45b1f5dc22c9751029576e41cbc09c9eb6...\n", - " What is the purpose of the Great Wall of China?\n", - " The purpose of the Great Wall of China is to p...\n", + " What materials were used to construct the Grea...\n", + " dab619e293076e8119d9dd0d0ea4a69bf0fff0f526951f...\n", + " 03c619187f0aae660725a45533184a2ccf58ebb264d92a...\n", + " The Great Wall of China was constructed using ...\n", " 1\n", " \n", " \n", " 4\n", - " 35f021445b7dd578f0e1ae1fd6e21f8bd2b76cb159a5a1...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " 3f180fd92f1aadef054b1481eeb5bbbaaa98d58992f3e7...\n", - " What materials were used to build the Great Wa...\n", - " The Great Wall of China was built using materi...\n", + " What was the primary purpose of building the G...\n", + " 329021930f100a10785cea69e4c1c42a965e5c1892b3ae...\n", + " b4d63625700e8f80dd0c42668eb1625d8c58e9716b904a...\n", + " The primary purpose of building the Great Wall...\n", " 1\n", " \n", " \n", @@ -347,13 +350,6 @@ "" ], "text/plain": [ - " _id \\\n", - "0 f95b082088c0bec2d0dc98a3799f31b3ebc050517cbfd3... \n", - "1 a8b7a30bebd2693a4f797455bb06d720613f72a11d3aaf... \n", - "2 24f82d3e8cc1227c8b4b50f0912192582724a754b48601... \n", - "3 b4c53eb52d1463892cc90f45be86de7e2bc8b34d349e21... \n", - "4 35f021445b7dd578f0e1ae1fd6e21f8bd2b76cb159a5a1... \n", - "\n", " text \\\n", "0 The Eiffel Tower is an iconic landmark of Pari... \n", "1 The Eiffel Tower is an iconic landmark of Pari... \n", @@ -368,29 +364,36 @@ "3 The Great Wall of China - Ancient Protection \n", "4 The Great Wall of China - Ancient Protection \n", "\n", - " question-id \\\n", - "0 ff9de122e00206eebb733e02e7d38ba50e9e9c640e5128... \n", - "1 d1cff3a0aaefc5ee61462131ac196f81b725f41458d3b6... \n", - "2 651eb12e56f0f40473acd24abb4199760ae746b16bdd48... \n", - "3 ef457b9db10d45b1f5dc22c9751029576e41cbc09c9eb6... \n", - "4 3f180fd92f1aadef054b1481eeb5bbbaaa98d58992f3e7... \n", - "\n", " question \\\n", "0 What is the significance of the Eiffel Tower i... \n", "1 Who was responsible for designing the Eiffel T... \n", - "2 What was the occasion for building the Eiffel ... \n", - "3 What is the purpose of the Great Wall of China? \n", - "4 What materials were used to build the Great Wa... \n", + "2 When was the Eiffel Tower built and for what p... \n", + "3 What materials were used to construct the Grea... \n", + "4 What was the primary purpose of building the G... \n", + "\n", + " _id \\\n", + "0 342d2d470596528b192b9f0a12d0ec5f4798ab1fc84090... \n", + "1 12dcafeb731d5ef4e1903f1e6cc35bfa9d5e40f740e967... \n", + "2 e5d22c48da4684bf5da4afe414d2d6630709e5b134b847... \n", + "3 dab619e293076e8119d9dd0d0ea4a69bf0fff0f526951f... \n", + "4 329021930f100a10785cea69e4c1c42a965e5c1892b3ae... \n", + "\n", + " question-id \\\n", + "0 c6075864cc0c9318df5456c2b06bfb581562542205ff99... \n", + "1 003de77e8d7a0d499d75edfc5ad4633d4a2703b89c1f09... \n", + "2 eb5bfbf35e7d53cc2affc58146721a017c72c38344ca1d... \n", + "3 03c619187f0aae660725a45533184a2ccf58ebb264d92a... \n", + "4 b4d63625700e8f80dd0c42668eb1625d8c58e9716b904a... \n", "\n", " answer score \n", "0 The Eiffel Tower is an iconic landmark in Pari... 1 \n", "1 The Eiffel Tower was designed by the engineer ... 1 \n", - "2 The Eiffel Tower was built for the 1889 Exposi... 1 \n", - "3 The purpose of the Great Wall of China is to p... 1 \n", - "4 The Great Wall of China was built using materi... 1 " + "2 The Eiffel Tower was built in 1889 for the Exp... 1 \n", + "3 The Great Wall of China was constructed using ... 1 \n", + "4 The primary purpose of building the Great Wall... 1 " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -415,15 +418,25 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "ef = EasinessFilter(cfg)\n", + "ef = EasinessFilter(cfg.base_url,\n", + " cfg.api_key,\n", + " cfg.easiness_filter,\n", + " cfg.percentile,\n", + " cfg.truncate,\n", + " cfg.batch_size)\n", "easiness_filter = ScoreFilter(ef,\n", " text_field = [\"text\", \"question\"],\n", " score_field = \"easiness_scores\")\n", - "af = AnswerabilityFilter(cfg)\n", + "af = AnswerabilityFilter(cfg.base_url,\n", + " cfg.api_key,\n", + " cfg.answerability_filter,\n", + " cfg.answerability_system_prompt,\n", + " cfg.answerability_user_prompt_template,\n", + " cfg.num_criteria)\n", "answerability_filter = ScoreFilter(af,\n", " text_field = [\"text\", \"question\"],\n", " score_field = \"answerability_scores\")" @@ -439,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -449,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -473,11 +486,11 @@ " \n", " \n", " \n", - " _id\n", " text\n", " title\n", - " question-id\n", " question\n", + " _id\n", + " question-id\n", " answer\n", " score\n", " easiness_scores\n", @@ -486,71 +499,64 @@ " \n", " \n", " 1\n", - " 7c5a346e2859af30eae493b5df189716db5355d75760a1...\n", " The Eiffel Tower is an iconic landmark of Pari...\n", " Eiffel Tower - A French Icon\n", - " fb39bc1a988e291cb8ac36a1acad08ed24d7969ab08ff0...\n", - " Who was responsible for designing the Eiffel T...\n", + " Who was the engineer behind the design of the ...\n", + " 5b31740eab0e66fa435ac3b2d0f3ad299e9bc885da22ad...\n", + " 985cd7b5de889c7b62eca2d45b83eac5c1ba6fa2dce681...\n", " The Eiffel Tower was designed by the engineer ...\n", " 1\n", - " 0.570407\n", + " 0.569564\n", " \n", " \n", " 3\n", - " e94a0488a45accbf70a59ca6d66180a5c771499dbd08a5...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " 24d74a34810676af68b0a4314d58733b10f79f6f75eb6c...\n", " What is the purpose of the Great Wall of China?\n", + " 2e40d9da383f39586c7f4a2e6cdc930de7ceaa1800d41c...\n", + " 108ee53f98dcba40d2e4654df9e41dadf313000b2cfbb0...\n", " The purpose of the Great Wall of China is to p...\n", " 1\n", " 0.527854\n", " \n", " \n", " 4\n", - " 0b16e4f1e3f3a027072b98284623f4d951870f12f46c95...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " 778edd3d72d428075464e7501fae1da599a46f83d09f99...\n", " What materials were used to build the Great Wa...\n", + " b05babced766cf6b65f43bc0d8c927d08a271d30423cd8...\n", + " a698316fccdb6facb8341372778863bb092fe71bc60357...\n", " The Great Wall of China was built using materi...\n", " 1\n", " 0.550470\n", " \n", " \n", " 5\n", - " f187bbe2592f4751233bba08aaac5a7321530de04dd19e...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " becf01b49eb0599e980b569d669b422882eededdcf6d5e...\n", - " What direction does the Great Wall of China ge...\n", - " The Great Wall of China generally follows an e...\n", + " What is the general direction of the Great Wal...\n", + " 88cd9adc26f148a24a1fbde7c5dfed1033db29c7ab997f...\n", + " a5996280c5a2b382c206ab4fc69b8981f7588a6c216b63...\n", + " The Great Wall of China was generally built al...\n", " 1\n", - " 0.506733\n", + " 0.462438\n", " \n", " \n", " 6\n", - " 7350f3fb9a7c31e8c62a3be39f3ba1e53ff08cbd6d0a00...\n", " The Taj Mahal is an ivory-white marble mausole...\n", " Taj Mahal - A Symbol of Love\n", - " eb0c1e84bd7c176e5a00eaa7c03af4db330fa97a3e8e2b...\n", - " What is the Taj Mahal and who built it?\n", - " The Taj Mahal is an ivory-white marble mausole...\n", + " What is the Taj Mahal primarily used for?\n", + " 4eaff3017898dab67377f19bef2cf7bbf7ee1223a661f7...\n", + " 1e2f14820a6f599a5d124f5cd6b0e2575a0fa601a36d5f...\n", + " The Taj Mahal is primarily used as a mausoleum...\n", " 1\n", - " 0.521735\n", + " 0.444493\n", " \n", " \n", "\n", "" ], "text/plain": [ - " _id \\\n", - "1 7c5a346e2859af30eae493b5df189716db5355d75760a1... \n", - "3 e94a0488a45accbf70a59ca6d66180a5c771499dbd08a5... \n", - "4 0b16e4f1e3f3a027072b98284623f4d951870f12f46c95... \n", - "5 f187bbe2592f4751233bba08aaac5a7321530de04dd19e... \n", - "6 7350f3fb9a7c31e8c62a3be39f3ba1e53ff08cbd6d0a00... \n", - "\n", " text \\\n", "1 The Eiffel Tower is an iconic landmark of Pari... \n", "3 The Great Wall of China is a series of fortifi... \n", @@ -565,29 +571,36 @@ "5 The Great Wall of China - Ancient Protection \n", "6 Taj Mahal - A Symbol of Love \n", "\n", - " question-id \\\n", - "1 fb39bc1a988e291cb8ac36a1acad08ed24d7969ab08ff0... \n", - "3 24d74a34810676af68b0a4314d58733b10f79f6f75eb6c... \n", - "4 778edd3d72d428075464e7501fae1da599a46f83d09f99... \n", - "5 becf01b49eb0599e980b569d669b422882eededdcf6d5e... \n", - "6 eb0c1e84bd7c176e5a00eaa7c03af4db330fa97a3e8e2b... \n", - "\n", " question \\\n", - "1 Who was responsible for designing the Eiffel T... \n", + "1 Who was the engineer behind the design of the ... \n", "3 What is the purpose of the Great Wall of China? \n", "4 What materials were used to build the Great Wa... \n", - "5 What direction does the Great Wall of China ge... \n", - "6 What is the Taj Mahal and who built it? \n", + "5 What is the general direction of the Great Wal... \n", + "6 What is the Taj Mahal primarily used for? \n", + "\n", + " _id \\\n", + "1 5b31740eab0e66fa435ac3b2d0f3ad299e9bc885da22ad... \n", + "3 2e40d9da383f39586c7f4a2e6cdc930de7ceaa1800d41c... \n", + "4 b05babced766cf6b65f43bc0d8c927d08a271d30423cd8... \n", + "5 88cd9adc26f148a24a1fbde7c5dfed1033db29c7ab997f... \n", + "6 4eaff3017898dab67377f19bef2cf7bbf7ee1223a661f7... \n", + "\n", + " question-id \\\n", + "1 985cd7b5de889c7b62eca2d45b83eac5c1ba6fa2dce681... \n", + "3 108ee53f98dcba40d2e4654df9e41dadf313000b2cfbb0... \n", + "4 a698316fccdb6facb8341372778863bb092fe71bc60357... \n", + "5 a5996280c5a2b382c206ab4fc69b8981f7588a6c216b63... \n", + "6 1e2f14820a6f599a5d124f5cd6b0e2575a0fa601a36d5f... \n", "\n", " answer score easiness_scores \n", - "1 The Eiffel Tower was designed by the engineer ... 1 0.570407 \n", + "1 The Eiffel Tower was designed by the engineer ... 1 0.569564 \n", "3 The purpose of the Great Wall of China is to p... 1 0.527854 \n", "4 The Great Wall of China was built using materi... 1 0.550470 \n", - "5 The Great Wall of China generally follows an e... 1 0.506733 \n", - "6 The Taj Mahal is an ivory-white marble mausole... 1 0.521735 " + "5 The Great Wall of China was generally built al... 1 0.462438 \n", + "6 The Taj Mahal is primarily used as a mausoleum... 1 0.444493 " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -598,7 +611,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -625,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -635,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -659,11 +672,11 @@ " \n", " \n", " \n", - " _id\n", " text\n", " title\n", - " question-id\n", " question\n", + " _id\n", + " question-id\n", " answer\n", " score\n", " easiness_scores\n", @@ -673,62 +686,62 @@ " \n", " \n", " 3\n", - " f6f3fece8d3797c71da36869fe0aabcf764f6ffa4694da...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " d29d16da19288242b409d2216d2d9b6aba261936fdd6fa...\n", - " What is the purpose of the Great Wall of China?\n", - " The purpose of the Great Wall of China is to p...\n", + " What materials were used to construct the Grea...\n", + " a5b2fd08b6a424a371b12c7d07c37044abddf168427dee...\n", + " e5078730ce04b2f8314fced830fbb037528097bfa4c9f8...\n", + " The Great Wall of China was constructed using ...\n", " 1\n", - " 0.527854\n", + " 0.553092\n", " {\\n\"criterion_1_explanation\": \"The question is...\n", " \n", " \n", " 4\n", - " b70486ff18e0a6870b167b5c23c24080c75687db219a05...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " 2cb46db5fd94ccc60980ff357a0f08292760a29f554c0a...\n", - " What materials were used to build the Great Wa...\n", - " The Great Wall of China was built using materi...\n", + " What was the primary purpose of building the G...\n", + " 51d260dd9881d4176553b1a416d3f299a375d903fc677b...\n", + " 4b5c0ad1ac49efb0f1d252792e4537c960566309f33eb0...\n", + " The primary purpose of building the Great Wall...\n", " 1\n", - " 0.550470\n", + " 0.505319\n", " {\\n\"criterion_1_explanation\": \"The question is...\n", " \n", " \n", " 5\n", - " d736edbfbde871f0064bef695a91dd663d62eacf4cff17...\n", " The Great Wall of China is a series of fortifi...\n", " The Great Wall of China - Ancient Protection\n", - " 44c7792e88cc656c440f0b9ffe3fd729dbdd05ee4594ee...\n", - " In which direction was the Great Wall of China...\n", + " Which direction was the Great Wall of China ge...\n", + " 5bf59b2efabd4a5b0d2f179841ff1cdc41086e6598a098...\n", + " 96456def817ac60dfa8a34f3983aba20209ec661d88535...\n", " The Great Wall of China was generally built al...\n", " 1\n", - " 0.553109\n", + " 0.545968\n", " {\\n\"criterion_1_explanation\": \"The question is...\n", " \n", " \n", " 6\n", - " bdfe150a3a62112c42a1145e98b5068e70303c28de303e...\n", " The Taj Mahal is an ivory-white marble mausole...\n", " Taj Mahal - A Symbol of Love\n", - " b0b0d6fbe55151b16bf4b8e6ce355ff9b9c9f07ba129dd...\n", - " What is the Taj Mahal primarily used for?\n", - " The Taj Mahal is primarily used as a mausoleum...\n", + " What is the Taj Mahal primarily made of?\n", + " 7cf289552442f65170be4c4d0a950a65b9d21ffeeca05d...\n", + " 53ea4d4ff9ba312d953c3a2a393bb503712f6036830aa8...\n", + " The Taj Mahal is primarily made of ivory-white...\n", " 1\n", - " 0.444493\n", + " 0.422271\n", " {\\n\"criterion_1_explanation\": \"The question is...\n", " \n", " \n", " 7\n", - " b5eee1a84b594db452d8a69ed00231a43f7b93f76e54e6...\n", " The Taj Mahal is an ivory-white marble mausole...\n", " Taj Mahal - A Symbol of Love\n", - " 83ef0cdb990227b9399ab24b0b3074c3842315b14f9ec4...\n", " Who commissioned the construction of the Taj M...\n", + " 3290bf8bb526a81774e70939849fd84a4ed49e708677ca...\n", + " 914f2cc013c20a7e34737435f17db7e05a38e8da333109...\n", " The Taj Mahal was commissioned by the Mughal e...\n", " 1\n", - " 0.555006\n", + " 0.547095\n", " {\\n\"criterion_1_explanation\": \"The question is...\n", " \n", " \n", @@ -736,13 +749,6 @@ "" ], "text/plain": [ - " _id \\\n", - "3 f6f3fece8d3797c71da36869fe0aabcf764f6ffa4694da... \n", - "4 b70486ff18e0a6870b167b5c23c24080c75687db219a05... \n", - "5 d736edbfbde871f0064bef695a91dd663d62eacf4cff17... \n", - "6 bdfe150a3a62112c42a1145e98b5068e70303c28de303e... \n", - "7 b5eee1a84b594db452d8a69ed00231a43f7b93f76e54e6... \n", - "\n", " text \\\n", "3 The Great Wall of China is a series of fortifi... \n", "4 The Great Wall of China is a series of fortifi... \n", @@ -757,26 +763,33 @@ "6 Taj Mahal - A Symbol of Love \n", "7 Taj Mahal - A Symbol of Love \n", "\n", - " question-id \\\n", - "3 d29d16da19288242b409d2216d2d9b6aba261936fdd6fa... \n", - "4 2cb46db5fd94ccc60980ff357a0f08292760a29f554c0a... \n", - "5 44c7792e88cc656c440f0b9ffe3fd729dbdd05ee4594ee... \n", - "6 b0b0d6fbe55151b16bf4b8e6ce355ff9b9c9f07ba129dd... \n", - "7 83ef0cdb990227b9399ab24b0b3074c3842315b14f9ec4... \n", - "\n", " question \\\n", - "3 What is the purpose of the Great Wall of China? \n", - "4 What materials were used to build the Great Wa... \n", - "5 In which direction was the Great Wall of China... \n", - "6 What is the Taj Mahal primarily used for? \n", + "3 What materials were used to construct the Grea... \n", + "4 What was the primary purpose of building the G... \n", + "5 Which direction was the Great Wall of China ge... \n", + "6 What is the Taj Mahal primarily made of? \n", "7 Who commissioned the construction of the Taj M... \n", "\n", + " _id \\\n", + "3 a5b2fd08b6a424a371b12c7d07c37044abddf168427dee... \n", + "4 51d260dd9881d4176553b1a416d3f299a375d903fc677b... \n", + "5 5bf59b2efabd4a5b0d2f179841ff1cdc41086e6598a098... \n", + "6 7cf289552442f65170be4c4d0a950a65b9d21ffeeca05d... \n", + "7 3290bf8bb526a81774e70939849fd84a4ed49e708677ca... \n", + "\n", + " question-id \\\n", + "3 e5078730ce04b2f8314fced830fbb037528097bfa4c9f8... \n", + "4 4b5c0ad1ac49efb0f1d252792e4537c960566309f33eb0... \n", + "5 96456def817ac60dfa8a34f3983aba20209ec661d88535... \n", + "6 53ea4d4ff9ba312d953c3a2a393bb503712f6036830aa8... \n", + "7 914f2cc013c20a7e34737435f17db7e05a38e8da333109... \n", + "\n", " answer score easiness_scores \\\n", - "3 The purpose of the Great Wall of China is to p... 1 0.527854 \n", - "4 The Great Wall of China was built using materi... 1 0.550470 \n", - "5 The Great Wall of China was generally built al... 1 0.553109 \n", - "6 The Taj Mahal is primarily used as a mausoleum... 1 0.444493 \n", - "7 The Taj Mahal was commissioned by the Mughal e... 1 0.555006 \n", + "3 The Great Wall of China was constructed using ... 1 0.553092 \n", + "4 The primary purpose of building the Great Wall... 1 0.505319 \n", + "5 The Great Wall of China was generally built al... 1 0.545968 \n", + "6 The Taj Mahal is primarily made of ivory-white... 1 0.422271 \n", + "7 The Taj Mahal was commissioned by the Mughal e... 1 0.547095 \n", "\n", " answerability_scores \n", "3 {\\n\"criterion_1_explanation\": \"The question is... \n", @@ -786,7 +799,7 @@ "7 {\\n\"criterion_1_explanation\": \"The question is... " ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -797,14 +810,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Total number of data points after application of answerability filter = 18\n" + "Total number of data points after application of answerability filter = 19\n" ] } ], diff --git a/tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py b/tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py index 6bb332dd8..096cec012 100644 --- a/tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py +++ b/tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py @@ -36,10 +36,14 @@ from nemo_curator import AsyncOpenAIClient, OpenAIClient from nemo_curator.datasets import DocumentDataset from nemo_curator.filters.doc_filter import DocumentFilter -from nemo_curator.modules.config import RetrieverEvalSDGConfig from nemo_curator.synthetic import AsyncNemotronGenerator, NemotronGenerator from nemo_curator.synthetic.generator import SyntheticDataGenerator +config = importlib.import_module( + "tutorials.nemo-retriever-synthetic-data-generation.config.config" +) +RetrieverEvalSDGConfig = config.RetrieverEvalSDGConfig + # ----------------------------------------------------------------------------80 class RetrieverEvalSetGenerator(SyntheticDataGenerator): @@ -74,14 +78,6 @@ def _init_pipeline_params(self): self.client = OpenAIClient(self.openai_client) self.generator = NemotronGenerator(self.client) - # TODO asynchronous - self.async_openai_client = AsyncOpenAI( - base_url=self.cfg.base_url, - api_key=self.cfg.api_key, - ) - self.async_client = AsyncOpenAIClient(self.async_openai_client) - self.async_generator = AsyncNemotronGenerator(self.async_client) - if self._validate_config(): self.sys_prompt = self.cfg.generator_system_prompt self.user_prompt_template = self.cfg.generator_user_prompt_template