Skip to content

Commit

Permalink
Allow setting up a name for experiment and the job
Browse files Browse the repository at this point in the history
  • Loading branch information
LizaShak committed Mar 17, 2024
1 parent 3a063cb commit 2b076b3
Show file tree
Hide file tree
Showing 13 changed files with 81 additions and 30 deletions.
6 changes: 6 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY=
# OPTIONAL
############

EXPERIMENT_NAME= # Optional, if not provided, the prefix of the search index will be used from the config.json file

# Azure ML job name and description. It is useful to set the same name for the experiment and a unique name for each run; that will group all results in the same dashboard inside Azure ML
JOB_NAME=
JOB_DESCRIPTION=

#### Azure Search Skillsets
AZURE_LANGUAGE_SERVICE_ENDPOINT=
AZURE_LANGUAGE_SERVICE_KEY=
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ To use the **RAG Experiment Accelerator**, follow these steps:

```json
{
"name_prefix": "Name of experiment, search index name used for tracking and comparing jobs",
"index_name_prefix": "Search index name prefix used for tracking and comparing jobs",
"chunking": {
"chunk_size": "Size of each chunk e.g. [500, 1000, 2000]" ,
"overlap_size": "Overlap Size for each chunk e.g. [100, 200, 300]"
Expand All @@ -193,6 +193,7 @@ To use the **RAG Experiment Accelerator**, follow these steps:
"index_analyzer_name" : "name of the analyzer used at indexing time for the field. This option can be used only with searchable fields. It must be set together with searchAnalyzer and it cannot be set together with the analyzer option.",
"search_analyzer_name" : "name of the analyzer used at search time for the field. This option can be used only with searchable fields. It must be set together with indexAnalyzer and it cannot be set together with the analyzer option. This property cannot be set to the name of a language analyzer; use the analyzer property instead if you need a language analyzer.",
},
"experiment_name": "name of the experiment",
"rerank": "determines if search results should be re-ranked. Valid values are TRUE or FALSE" ,
"rerank_type": "determines the type of re-ranking. Valid values are llm or crossencoder",
"llm_re_rank_threshold": "determines the threshold when using llm re-ranking. Chunks with rank above this number are selected in range from 1 - 10." ,
Expand Down
5 changes: 4 additions & 1 deletion config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{
"name_prefix": "surface",
"index_name_prefix": "surface",
"experiment_name": "surface",
"job_name": "baseline",
"job_description": "",
"chunking": {
"chunk_size": [1000],
"overlap_size": [200]
Expand Down
5 changes: 3 additions & 2 deletions promptflow/rag-experiment-accelerator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The `setup` node runs first and loads the required environment variables from a

### Index
The `index` node will:
- Create indexes based on the parameters set in `config.json`. Each index name will be in the following format: `{name_prefix}-{chunk_size}-{overlap}-{dimension}-{ef_construction}-{ef_search}`
- Create indexes based on the parameters set in `config.json`. Each index name will be in the following format: `{index_name_prefix}-{chunk_size}-{overlap}-{dimension}-{ef_construction}-{ef_search}`
- Chunk documents based on the chunking parameters in `config.json`
- Generate a summary and title for each chunk
- Create embeddings for each chunk's content, generated title, and generated summary
Expand Down Expand Up @@ -96,7 +96,7 @@ az ml environment create --file ./environment.yaml -w $MLWorkSpaceName

```json
{
"name_prefix": "Name of experiment, search index name used for tracking and comparing jobs",
"index_name_prefix": "Search index name prefix used for tracking and comparing jobs",
"chunking": {
"chunk_size": "Size of each chunk e.g. [500, 1000, 2000]" ,
"overlap_size": "Overlap Size for each chunk e.g. [100, 200, 300]"
Expand All @@ -109,6 +109,7 @@ az ml environment create --file ./environment.yaml -w $MLWorkSpaceName
"index_analyzer_name" : "name of the analyzer used at indexing time for the field. This option can be used only with searchable fields. It must be set together with searchAnalyzer and it cannot be set together with the analyzer option.",
"search_analyzer_name" : "name of the analyzer used at search time for the field. This option can be used only with searchable fields. It must be set together with indexAnalyzer and it cannot be set together with the analyzer option. This property cannot be set to the name of a language analyzer; use the analyzer property instead if you need a language analyzer.",
},
"experiment_name": "name of the experiment",
"rerank": "determines if search results should be re-ranked. Valid values are TRUE or FALSE" ,
"rerank_type": "determines the type of re-ranking. Valid values are llm or crossencoder",
"llm_re_rank_threshold": "determines the threshold when using llm re-ranking. Chunks with rank above this number are selected in range from 1 - 10." ,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ def __init__(
"""
super().__init__(data_location=data_location, writer=writer, loader=loader)

def _get_output_name(self, index_name: str) -> str:
def _get_output_name(
self, index_name: str, experiment_name: str, job_name: str
) -> str:
"""
Returns the output name for a given index name.
Expand All @@ -35,7 +37,7 @@ def _get_output_name(self, index_name: str) -> str:
Returns:
str: The output name.
"""
return f"eval_output_{index_name}.jsonl"
return f"eval_output_{index_name}_{experiment_name}_{job_name}.jsonl"

def get_output_path(self, index_name: str) -> str:
"""
Expand All @@ -49,7 +51,9 @@ def get_output_path(self, index_name: str) -> str:
"""
return f"{self.data_location}/{self._get_output_name(index_name)}"

def load(self, index_name: str) -> list[QueryOutput]:
def load(
self, index_name: str, experiment_name: str, job_name: str
) -> list[QueryOutput]:
"""
Loads the query outputs for a given index name.
Expand All @@ -59,7 +63,7 @@ def load(self, index_name: str) -> list[QueryOutput]:
Returns:
list[QueryOutput]: The loaded query outputs.
"""
output_name = self._get_output_name(index_name)
output_name = self._get_output_name(index_name, experiment_name, job_name)

query_outputs = []
data_load = super().load(output_name)
Expand All @@ -72,7 +76,9 @@ def load(self, index_name: str) -> list[QueryOutput]:

return query_outputs

def handle_archive_by_index(self, index_name: str) -> str | None:
def handle_archive_by_index(
self, index_name: str, experiment_name: str, job_name: str
) -> str | None:
"""
Handles archiving of query output for a given index name.
Expand All @@ -82,16 +88,18 @@ def handle_archive_by_index(self, index_name: str) -> str | None:
Returns:
str | None: The output filename if successful, None otherwise.
"""
output_filename = self._get_output_name(index_name)
output_filename = self._get_output_name(index_name, experiment_name, job_name)
return self.handle_archive(output_filename)

def save(self, data: QueryOutput, index_name: str):
def save(
self, data: QueryOutput, index_name: str, experiment_name: str, job_name: str
):
"""
Saves the query output for a given index name.
Args:
data (QueryOutput): The query output to be saved.
index_name (str): The name of the index.
"""
output_filename = self._get_output_name(index_name)
output_filename = self._get_output_name(index_name, experiment_name, job_name)
self.save_dict(data.__dict__, output_filename)
14 changes: 11 additions & 3 deletions rag_experiment_accelerator/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ class Config:
EMBEDDING_DIMENSIONS (list[int]): The number of dimensions to use for document embeddings.
EF_CONSTRUCTIONS (list[int]): The number of ef_construction to use for HNSW index.
EF_SEARCHES (list[int]): The number of ef_search to use for HNSW index.
NAME_PREFIX (str): A prefix to use for the names of saved models.
INDEX_NAME_PREFIX (str): A prefix to use for the names of saved models.
EXPERIMENT_NAME (str): The name of the experiment in Azure ML (optional, if not set INDEX_NAME_PREFIX will be used).
JOB_NAME (str): The name of the job in Azure ML (optional, if not set EXPERIMENT_NAME and current datetime will be used).
JOB_DESCRIPTION (str): The description of the job in Azure ML (optional).
SEARCH_VARIANTS (list[str]): A list of search types to use.
AZURE_OAI_CHAT_DEPLOYMENT_NAME (str): The name of the Azure deployment to use.
AZURE_OAI_EVAL_DEPLOYMENT_NAME (str): The name of the deployment to use for evaluation.
Expand Down Expand Up @@ -100,7 +103,10 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:
self.OVERLAP_SIZES = data["chunking"]["overlap_size"]
self.EF_CONSTRUCTIONS = data["ef_construction"]
self.EF_SEARCHES = data["ef_search"]
self.NAME_PREFIX = data["name_prefix"]
self.INDEX_NAME_PREFIX = data["index_name_prefix"]
self.EXPERIMENT_NAME = data["experiment_name"] or self.INDEX_NAME_PREFIX
self.JOB_NAME = data["job_name"]
self.JOB_DESCRIPTION = data["job_description"]
self.SEARCH_VARIANTS = data["search_types"]
self.AZURE_OAI_CHAT_DEPLOYMENT_NAME = data.get(
"azure_oai_chat_deployment_name", None
Expand All @@ -124,7 +130,9 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:
self.AzureSearchCredentials = AzureSearchCredentials.from_env()
self.AzureMLCredentials = AzureMLCredentials.from_env()
self.AzureSkillsCredentials = AzureSkillsCredentials.from_env()
self.AzureDocumentIntelligenceCredentials = AzureDocumentIntelligenceCredentials.from_env()
self.AzureDocumentIntelligenceCredentials = (
AzureDocumentIntelligenceCredentials.from_env()
)

self.embedding_models: list[EmbeddingModel] = []
embedding_model_config = data.get("embedding_models", [])
Expand Down
5 changes: 4 additions & 1 deletion rag_experiment_accelerator/config/tests/data/config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{
"name_prefix": "test_prefix",
"index_name_prefix": "test_prefix",
"experiment_name": "experiment_1",
"job_name": "baseline",
"job_description": "",
"chunking": {
"chunk_size": [512],
"overlap_size": [128]
Expand Down
3 changes: 2 additions & 1 deletion rag_experiment_accelerator/config/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def test_config_init(mock_embedding_model_factory):

config.embedding_models = [embedding_model_1, embedding_model_2]

assert config.NAME_PREFIX == mock_config_data["name_prefix"]
assert config.INDEX_NAME_PREFIX == mock_config_data["index_name_prefix"]
assert config.EXPERIMENT_NAME == mock_config_data["experiment_name"]
assert config.CHUNK_SIZES == mock_config_data["chunking"]["chunk_size"]
assert config.OVERLAP_SIZES == mock_config_data["chunking"]["overlap_size"]
assert config.CHUNKING_STRATEGY == mock_config_data["chunking_strategy"]
Expand Down
12 changes: 9 additions & 3 deletions rag_experiment_accelerator/evaluation/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,8 @@ def compute_metrics(question, actual, expected, context, metric_type):

def evaluate_prompts(
exp_name: str,
job_name,
job_description,
index_name: str,
config: Config,
client: mlflow.MlflowClient,
Expand Down Expand Up @@ -607,9 +609,13 @@ def evaluate_prompts(
metric_types = config.METRIC_TYPES
num_search_type = config.SEARCH_VARIANTS
data_list = []
run_name = f"{exp_name}_{formatted_datetime}"
run_name = (
job_name
if (job_name is not None) and (job_name != "")
else f"{exp_name}_{formatted_datetime}"
)
mlflow.set_experiment(exp_name)
mlflow.start_run(run_name=run_name)
mlflow.start_run(run_name=run_name, description=job_description)
pd.set_option("display.max_columns", None)

run_id = mlflow.active_run().info.run_id
Expand All @@ -619,7 +625,7 @@ def evaluate_prompts(
average_precision_for_search_type = {}

handler = QueryOutputHandler(config.QUERY_DATA_LOCATION)
query_data_load = handler.load(index_name)
query_data_load = handler.load(index_name, config.EXPERIMENT_NAME, config.JOB_NAME)
for data in query_data_load:
actual = remove_spaces(lower(data.actual))
expected = remove_spaces(lower(data.expected))
Expand Down
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/run/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def run(config_dir: str, filename: str = "config.json"):
for ef_construction in config.EF_CONSTRUCTIONS:
for ef_search in config.EF_SEARCHES:
index_name = get_index_name(
config.NAME_PREFIX,
config.INDEX_NAME_PREFIX,
chunk_size,
overlap,
embedding_model.name,
Expand All @@ -55,7 +55,9 @@ def run(config_dir: str, filename: str = "config.json"):
logger.info(f"Evaluating Index: {index_name}")

eval.evaluate_prompts(
exp_name=config.NAME_PREFIX,
exp_name=config.EXPERIMENT_NAME,
job_name=config.JOB_NAME,
job_description=config.JOB_DESCRIPTION,
index_name=index_name,
config=config,
client=client,
Expand Down
11 changes: 8 additions & 3 deletions rag_experiment_accelerator/run/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def run(config_dir: str, data_dir: str = "data", filename: str = "config.json")
for ef_construction in config.EF_CONSTRUCTIONS:
for ef_search in config.EF_SEARCHES:
index_name = get_index_name(
config.NAME_PREFIX,
config.INDEX_NAME_PREFIX,
chunk_size,
overlap,
embedding_model.name,
Expand All @@ -76,13 +76,18 @@ def run(config_dir: str, data_dir: str = "data", filename: str = "config.json")
for chunk_size in config.CHUNK_SIZES:
for overlap in config.OVERLAP_SIZES:
all_docs = load_documents(
config.CHUNKING_STRATEGY, config.AzureDocumentIntelligenceCredentials, config.DATA_FORMATS, config.data_dir, chunk_size, overlap
config.CHUNKING_STRATEGY,
config.AzureDocumentIntelligenceCredentials,
config.DATA_FORMATS,
config.data_dir,
chunk_size,
overlap,
)
for embedding_model in config.embedding_models:
for ef_construction in config.EF_CONSTRUCTIONS:
for ef_search in config.EF_SEARCHES:
index_name = get_index_name(
config.NAME_PREFIX,
config.INDEX_NAME_PREFIX,
chunk_size,
overlap,
embedding_model.name,
Expand Down
13 changes: 10 additions & 3 deletions rag_experiment_accelerator/run/querying.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def run(config_dir: str, filename: str = "config.json"):
for ef_construction in config.EF_CONSTRUCTIONS:
for ef_search in config.EF_SEARCHES:
index_name = get_index_name(
config.NAME_PREFIX,
config.INDEX_NAME_PREFIX,
chunk_size,
overlap,
embedding_model.name,
Expand All @@ -277,7 +277,9 @@ def run(config_dir: str, filename: str = "config.json"):
)
logger.info(f"Index: {index_name}")

handler.handle_archive_by_index(index_name)
handler.handle_archive_by_index(
index_name, config.EXPERIMENT_NAME, config.JOB_NAME
)

search_client = create_client(
service_endpoint, index_name, search_admin_key
Expand Down Expand Up @@ -394,7 +396,12 @@ def run(config_dir: str, filename: str = "config.json"):
context=qna_context,
question=user_prompt,
)
handler.save(index_name=index_name, data=output)
handler.save(
index_name=index_name,
data=output,
experiment_name=config.EXPERIMENT_NAME,
job_name=config.JOB_NAME,
)

except BadRequestError as e:
logger.error(
Expand Down
4 changes: 2 additions & 2 deletions rag_experiment_accelerator/run/tests/test_querying.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):
self.mock_config.EF_CONSTRUCTIONS = [400]
self.mock_config.EF_SEARCHES = [400]
self.mock_config.SEARCH_VARIANTS = ["search_for_match_semantic"]
self.mock_config.NAME_PREFIX = "prefix"
self.mock_config.INDEX_NAME_PREFIX = "prefix"
self.mock_config.RERANK_TYPE = "llm"
self.mock_config.CHUNK_SIZES = [1]
self.mock_config.OVERLAP_SIZES = [1]
Expand Down Expand Up @@ -277,7 +277,7 @@ def test_run_no_multi_no_rerank(
mock_config.return_value.EF_CONSTRUCTIONS = [400]
mock_config.return_value.EF_SEARCHES = [400]
mock_config.return_value.SEARCH_VARIANTS = ["search_for_match_semantic"]
mock_config.return_value.NAME_PREFIX = "prefix"
mock_config.return_value.INDEX_NAME_PREFIX = "prefix"
mock_config.return_value.RERANK = False
mock_do_we_need_multiple_questions.return_value = False
mock_query_and_eval_acs.return_value = [MagicMock(), MagicMock()]
Expand Down

0 comments on commit 2b076b3

Please sign in to comment.