From 2b076b39b8372f709e8d0acfe7895a8780f884f4 Mon Sep 17 00:00:00 2001
From: Liza Shakury
Date: Sun, 17 Mar 2024 13:14:06 +0000
Subject: [PATCH] Allow setting up a name for experiment and the job

---
 .env.template                                 |  6 +++++
 README.md                                     |  3 ++-
 config.json                                   |  5 +++-
 .../rag-experiment-accelerator/README.md      |  5 ++--
 .../artifact/handlers/query_output_handler.py | 24 ++++++++++++-------
 rag_experiment_accelerator/config/config.py   | 14 ++++++++---
 .../config/tests/data/config.json             |  5 +++-
 .../config/tests/test_config.py               |  3 ++-
 rag_experiment_accelerator/evaluation/eval.py | 12 +++++++---
 rag_experiment_accelerator/run/evaluation.py  |  6 +++--
 rag_experiment_accelerator/run/index.py       | 11 ++++++---
 rag_experiment_accelerator/run/querying.py    | 13 +++++++---
 .../run/tests/test_querying.py                |  4 ++--
 13 files changed, 81 insertions(+), 30 deletions(-)

diff --git a/.env.template b/.env.template
index 0aaa8a09..97d11bde 100644
--- a/.env.template
+++ b/.env.template
@@ -27,6 +27,12 @@ AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY=
 # OPTIONAL
 ############
 
+EXPERIMENT_NAME= # Optional; if not provided, the search index prefix from the config.json file is used
+
+# Azure ML job name and description. Keep the experiment name the same across runs and give each run a unique job name; this groups all results in the same dashboard inside Azure ML.
+JOB_NAME=
+JOB_DESCRIPTION=
+
 #### Azure Search Skillsets
 AZURE_LANGUAGE_SERVICE_ENDPOINT=
 AZURE_LANGUAGE_SERVICE_KEY=
diff --git a/README.md b/README.md
index 8260de81..4774c9dd 100644
--- a/README.md
+++ b/README.md
@@ -179,7 +179,7 @@ To use the **RAG Experiment Accelerator**, follow these steps:
 
 ```json
 {
-    "name_prefix": "Name of experiment, search index name used for tracking and comparing jobs",
+    "index_name_prefix": "Search index name prefix used for tracking and comparing jobs",
     "chunking": {
         "chunk_size": "Size of each chunk e.g. [500, 1000, 2000]" ,
         "overlap_size": "Overlap Size for each chunk e.g. [100, 200, 300]"
@@ -193,6 +193,7 @@ To use the **RAG Experiment Accelerator**, follow these steps:
         "index_analyzer_name" : "name of the analyzer used at indexing time for the field. This option can be used only with searchable fields. It must be set together with searchAnalyzer and it cannot be set together with the analyzer option.",
         "search_analyzer_name" : "name of the analyzer used at search time for the field. This option can be used only with searchable fields. It must be set together with indexAnalyzer and it cannot be set together with the analyzer option. This property cannot be set to the name of a language analyzer; use the analyzer property instead if you need a language analyzer.",
     },
+    "experiment_name": "name of the experiment",
     "rerank": "determines if search results should be re-ranked. Value values are TRUE or FALSE" ,
     "rerank_type": "determines the type of re-ranking. Value values are llm or crossencoder",
     "llm_re_rank_threshold": "determines the threshold when using llm re-ranking. Chunks with rank above this number are selected in range from 1 - 10." ,
diff --git a/config.json b/config.json
index 31e4f013..60bb8676 100644
--- a/config.json
+++ b/config.json
@@ -1,5 +1,8 @@
 {
-    "name_prefix": "surface",
+    "index_name_prefix": "surface",
+    "experiment_name": "surface",
+    "job_name": "baseline",
+    "job_description": "",
     "chunking": {
         "chunk_size": [1000],
         "overlap_size": [200]
diff --git a/promptflow/rag-experiment-accelerator/README.md b/promptflow/rag-experiment-accelerator/README.md
index 63231ba8..95232526 100644
--- a/promptflow/rag-experiment-accelerator/README.md
+++ b/promptflow/rag-experiment-accelerator/README.md
@@ -12,7 +12,7 @@ The `setup` node runs first and loads the required environment variables from a
 
 ### Index
 The `index` node will:
-- Create indexes based on the parameters set in `config.json`. Each index name will be in the following format: `{name_prefix}-{chunk_size}-{overlap}-{dimension}-{ef_construction}-{ef_search}`
+- Create indexes based on the parameters set in `config.json`. Each index name will be in the following format: `{index_name_prefix}-{chunk_size}-{overlap}-{dimension}-{ef_construction}-{ef_search}`
 - Chunk documents based on the chunking parameters in `config.json`
 - Generate a summary and title for each chunk
 - Create embeddings for each chunk's content, generated title, and generated summary
@@ -96,7 +96,7 @@ az ml environment create --file ./environment.yaml -w $MLWorkSpaceName
 
 ```json
 {
-    "name_prefix": "Name of experiment, search index name used for tracking and comparing jobs",
+    "index_name_prefix": "Search index name prefix used for tracking and comparing jobs",
     "chunking": {
         "chunk_size": "Size of each chunk e.g. [500, 1000, 2000]" ,
         "overlap_size": "Overlap Size for each chunk e.g. [100, 200, 300]"
@@ -109,6 +109,7 @@ az ml environment create --file ./environment.yaml -w $MLWorkSpaceName
         "index_analyzer_name" : "name of the analyzer used at indexing time for the field. This option can be used only with searchable fields. It must be set together with searchAnalyzer and it cannot be set together with the analyzer option.",
         "search_analyzer_name" : "name of the analyzer used at search time for the field. This option can be used only with searchable fields. It must be set together with indexAnalyzer and it cannot be set together with the analyzer option. This property cannot be set to the name of a language analyzer; use the analyzer property instead if you need a language analyzer.",
     },
+    "experiment_name": "name of the experiment",
     "rerank": "determines if search results should be re-ranked. Value values are TRUE or FALSE" ,
     "rerank_type": "determines the type of re-ranking. Value values are llm or crossencoder",
     "llm_re_rank_threshold": "determines the threshold when using llm re-ranking. Chunks with rank above this number are selected in range from 1 - 10." ,
diff --git a/rag_experiment_accelerator/artifact/handlers/query_output_handler.py b/rag_experiment_accelerator/artifact/handlers/query_output_handler.py
index 58e79281..8ff17657 100644
--- a/rag_experiment_accelerator/artifact/handlers/query_output_handler.py
+++ b/rag_experiment_accelerator/artifact/handlers/query_output_handler.py
@@ -25,7 +25,9 @@ def __init__(
         """
         super().__init__(data_location=data_location, writer=writer, loader=loader)
 
-    def _get_output_name(self, index_name: str) -> str:
+    def _get_output_name(
+        self, index_name: str, experiment_name: str, job_name: str
+    ) -> str:
         """
         Returns the output name for a given index name.
 
@@ -35,7 +37,7 @@ def _get_output_name(self, index_name: str) -> str:
         Returns:
             str: The output name.
         """
-        return f"eval_output_{index_name}.jsonl"
+        return f"eval_output_{index_name}_{experiment_name}_{job_name}.jsonl"
 
     def get_output_path(self, index_name: str) -> str:
         """
@@ -49,7 +51,9 @@ def get_output_path(self, index_name: str) -> str:
         """
         return f"{self.data_location}/{self._get_output_name(index_name)}"
 
-    def load(self, index_name: str) -> list[QueryOutput]:
+    def load(
+        self, index_name: str, experiment_name: str, job_name: str
+    ) -> list[QueryOutput]:
         """
         Loads the query outputs for a given index name.
 
@@ -59,7 +63,7 @@ def load(self, index_name: str) -> list[QueryOutput]:
         Returns:
             list[QueryOutput]: The loaded query outputs.
         """
-        output_name = self._get_output_name(index_name)
+        output_name = self._get_output_name(index_name, experiment_name, job_name)
         query_outputs = []
 
         data_load = super().load(output_name)
@@ -72,7 +76,9 @@ def load(self, index_name: str) -> list[QueryOutput]:
 
         return query_outputs
 
-    def handle_archive_by_index(self, index_name: str) -> str | None:
+    def handle_archive_by_index(
+        self, index_name: str, experiment_name: str, job_name: str
+    ) -> str | None:
         """
         Handles archiving of query output for a given index name.
 
@@ -82,10 +88,12 @@
         Returns:
             str | None: The output filename if successful, None otherwise.
         """
-        output_filename = self._get_output_name(index_name)
+        output_filename = self._get_output_name(index_name, experiment_name, job_name)
         return self.handle_archive(output_filename)
 
-    def save(self, data: QueryOutput, index_name: str):
+    def save(
+        self, data: QueryOutput, index_name: str, experiment_name: str, job_name: str
+    ):
         """
         Saves the query output for a given index name.
 
@@ -93,5 +101,5 @@
             data (QueryOutput): The query output to be saved.
             index_name (str): The name of the index.
         """
-        output_filename = self._get_output_name(index_name)
+        output_filename = self._get_output_name(index_name, experiment_name, job_name)
         self.save_dict(data.__dict__, output_filename)
diff --git a/rag_experiment_accelerator/config/config.py b/rag_experiment_accelerator/config/config.py
index 6404a6d3..f8ccde0c 100644
--- a/rag_experiment_accelerator/config/config.py
+++ b/rag_experiment_accelerator/config/config.py
@@ -29,7 +29,10 @@ class Config:
         EMBEDDING_DIMENSIONS (list[int]): The number of dimensions to use for document embeddings.
         EF_CONSTRUCTIONS (list[int]): The number of ef_construction to use for HNSW index.
         EF_SEARCHES (list[int]): The number of ef_search to use for HNSW index.
-        NAME_PREFIX (str): A prefix to use for the names of saved models.
+        INDEX_NAME_PREFIX (str): A prefix to use for search index names.
+        EXPERIMENT_NAME (str): The name of the experiment in Azure ML (optional; if not set, INDEX_NAME_PREFIX will be used).
+        JOB_NAME (str): The name of the job in Azure ML (optional; if not set, EXPERIMENT_NAME and the current datetime will be used).
+        JOB_DESCRIPTION (str): The description of the job in Azure ML (optional).
         SEARCH_VARIANTS (list[str]): A list of search types to use.
         AZURE_OAI_CHAT_DEPLOYMENT_NAME (str): The name of the Azure deployment to use.
         AZURE_OAI_EVAL_DEPLOYMENT_NAME (str): The name of the deployment to use for evaluation.
@@ -100,7 +103,10 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:
         self.OVERLAP_SIZES = data["chunking"]["overlap_size"]
         self.EF_CONSTRUCTIONS = data["ef_construction"]
         self.EF_SEARCHES = data["ef_search"]
-        self.NAME_PREFIX = data["name_prefix"]
+        self.INDEX_NAME_PREFIX = data["index_name_prefix"]
+        self.EXPERIMENT_NAME = data["experiment_name"] or self.INDEX_NAME_PREFIX
+        self.JOB_NAME = data["job_name"]
+        self.JOB_DESCRIPTION = data["job_description"]
         self.SEARCH_VARIANTS = data["search_types"]
         self.AZURE_OAI_CHAT_DEPLOYMENT_NAME = data.get(
             "azure_oai_chat_deployment_name", None
@@ -124,7 +130,9 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:
         self.AzureSearchCredentials = AzureSearchCredentials.from_env()
         self.AzureMLCredentials = AzureMLCredentials.from_env()
         self.AzureSkillsCredentials = AzureSkillsCredentials.from_env()
-        self.AzureDocumentIntelligenceCredentials = AzureDocumentIntelligenceCredentials.from_env()
+        self.AzureDocumentIntelligenceCredentials = (
+            AzureDocumentIntelligenceCredentials.from_env()
+        )
 
         self.embedding_models: list[EmbeddingModel] = []
         embedding_model_config = data.get("embedding_models", [])
diff --git a/rag_experiment_accelerator/config/tests/data/config.json b/rag_experiment_accelerator/config/tests/data/config.json
index c6dcdc56..10c79575 100644
--- a/rag_experiment_accelerator/config/tests/data/config.json
+++ b/rag_experiment_accelerator/config/tests/data/config.json
@@ -1,5 +1,8 @@
 {
-    "name_prefix": "test_prefix",
+    "index_name_prefix": "test_prefix",
+    "experiment_name": "experiment_1",
+    "job_name": "baseline",
+    "job_description": "",
     "chunking": {
         "chunk_size": [512],
         "overlap_size": [128]
diff --git a/rag_experiment_accelerator/config/tests/test_config.py b/rag_experiment_accelerator/config/tests/test_config.py
index c1951aca..bf73d949 100644
--- a/rag_experiment_accelerator/config/tests/test_config.py
+++ b/rag_experiment_accelerator/config/tests/test_config.py
@@ -55,7 +55,8 @@ def test_config_init(mock_embedding_model_factory):
 
     config.embedding_models = [embedding_model_1, embedding_model_2]
 
-    assert config.NAME_PREFIX == mock_config_data["name_prefix"]
+    assert config.INDEX_NAME_PREFIX == mock_config_data["index_name_prefix"]
+    assert config.EXPERIMENT_NAME == mock_config_data["experiment_name"]
     assert config.CHUNK_SIZES == mock_config_data["chunking"]["chunk_size"]
     assert config.OVERLAP_SIZES == mock_config_data["chunking"]["overlap_size"]
     assert config.CHUNKING_STRATEGY == mock_config_data["chunking_strategy"]
diff --git a/rag_experiment_accelerator/evaluation/eval.py b/rag_experiment_accelerator/evaluation/eval.py
index 49bd09c2..4000edbc 100644
--- a/rag_experiment_accelerator/evaluation/eval.py
+++ b/rag_experiment_accelerator/evaluation/eval.py
@@ -566,6 +566,8 @@ def compute_metrics(question, actual, expected, context, metric_type):
 
 def evaluate_prompts(
     exp_name: str,
+    job_name,
+    job_description,
     index_name: str,
     config: Config,
     client: mlflow.MlflowClient,
@@ -607,9 +609,13 @@ def evaluate_prompts(
     metric_types = config.METRIC_TYPES
     num_search_type = config.SEARCH_VARIANTS
     data_list = []
-    run_name = f"{exp_name}_{formatted_datetime}"
+    run_name = (
+        job_name
+        if (job_name is not None) and (job_name != "")
+        else f"{exp_name}_{formatted_datetime}"
+    )
     mlflow.set_experiment(exp_name)
-    mlflow.start_run(run_name=run_name)
+    mlflow.start_run(run_name=run_name, description=job_description)
     pd.set_option("display.max_columns", None)
     run_id = mlflow.active_run().info.run_id
 
@@ -619,7 +625,7 @@
 
     average_precision_for_search_type = {}
     handler = QueryOutputHandler(config.QUERY_DATA_LOCATION)
-    query_data_load = handler.load(index_name)
+    query_data_load = handler.load(index_name, config.EXPERIMENT_NAME, config.JOB_NAME)
     for data in query_data_load:
         actual = remove_spaces(lower(data.actual))
         expected = remove_spaces(lower(data.expected))
diff --git a/rag_experiment_accelerator/run/evaluation.py b/rag_experiment_accelerator/run/evaluation.py
index f516a85b..ad070fc9 100644
--- a/rag_experiment_accelerator/run/evaluation.py
+++ b/rag_experiment_accelerator/run/evaluation.py
@@ -45,7 +45,7 @@ def run(config_dir: str, filename: str = "config.json"):
             for ef_construction in config.EF_CONSTRUCTIONS:
                 for ef_search in config.EF_SEARCHES:
                     index_name = get_index_name(
-                        config.NAME_PREFIX,
+                        config.INDEX_NAME_PREFIX,
                        chunk_size,
                        overlap,
                        embedding_model.name,
@@ -55,7 +55,9 @@ def run(config_dir: str, filename: str = "config.json"):
                     logger.info(f"Evaluating Index: {index_name}")
 
                     eval.evaluate_prompts(
-                        exp_name=config.NAME_PREFIX,
+                        exp_name=config.EXPERIMENT_NAME,
+                        job_name=config.JOB_NAME,
+                        job_description=config.JOB_DESCRIPTION,
                         index_name=index_name,
                         config=config,
                         client=client,
diff --git a/rag_experiment_accelerator/run/index.py b/rag_experiment_accelerator/run/index.py
index bfbe3c9c..92518a4b 100644
--- a/rag_experiment_accelerator/run/index.py
+++ b/rag_experiment_accelerator/run/index.py
@@ -50,7 +50,7 @@ def run(config_dir: str, data_dir: str = "data", filename: str = "config.json")
             for ef_construction in config.EF_CONSTRUCTIONS:
                 for ef_search in config.EF_SEARCHES:
                     index_name = get_index_name(
-                        config.NAME_PREFIX,
+                        config.INDEX_NAME_PREFIX,
                        chunk_size,
                        overlap,
                        embedding_model.name,
@@ -76,13 +76,18 @@ def run(config_dir: str, data_dir: str = "data", filename: str = "config.json")
     for chunk_size in config.CHUNK_SIZES:
         for overlap in config.OVERLAP_SIZES:
             all_docs = load_documents(
-                config.CHUNKING_STRATEGY, config.AzureDocumentIntelligenceCredentials, config.DATA_FORMATS, config.data_dir, chunk_size, overlap
+                config.CHUNKING_STRATEGY,
+                config.AzureDocumentIntelligenceCredentials,
+                config.DATA_FORMATS,
+                config.data_dir,
+                chunk_size,
+                overlap,
             )
             for embedding_model in config.embedding_models:
                 for ef_construction in config.EF_CONSTRUCTIONS:
                     for ef_search in config.EF_SEARCHES:
                         index_name = get_index_name(
-                            config.NAME_PREFIX,
+                            config.INDEX_NAME_PREFIX,
                            chunk_size,
                            overlap,
                            embedding_model.name,
diff --git a/rag_experiment_accelerator/run/querying.py b/rag_experiment_accelerator/run/querying.py
index f2986636..0a703acb 100644
--- a/rag_experiment_accelerator/run/querying.py
+++ b/rag_experiment_accelerator/run/querying.py
@@ -268,7 +268,7 @@ def run(config_dir: str, filename: str = "config.json"):
             for ef_construction in config.EF_CONSTRUCTIONS:
                 for ef_search in config.EF_SEARCHES:
                     index_name = get_index_name(
-                        config.NAME_PREFIX,
+                        config.INDEX_NAME_PREFIX,
                        chunk_size,
                        overlap,
                        embedding_model.name,
@@ -277,7 +277,9 @@ def run(config_dir: str, filename: str = "config.json"):
                    )
 
                     logger.info(f"Index: {index_name}")
-                    handler.handle_archive_by_index(index_name)
+                    handler.handle_archive_by_index(
+                        index_name, config.EXPERIMENT_NAME, config.JOB_NAME
+                    )
 
                     search_client = create_client(
                        service_endpoint, index_name, search_admin_key
@@ -394,7 +396,12 @@ def run(config_dir: str, filename: str = "config.json"):
                                context=qna_context,
                                question=user_prompt,
                            )
-                            handler.save(index_name=index_name, data=output)
+                            handler.save(
+                                index_name=index_name,
+                                data=output,
+                                experiment_name=config.EXPERIMENT_NAME,
+                                job_name=config.JOB_NAME,
+                            )
 
                         except BadRequestError as e:
                             logger.error(
diff --git a/rag_experiment_accelerator/run/tests/test_querying.py b/rag_experiment_accelerator/run/tests/test_querying.py
index 910db2af..546ee694 100644
--- a/rag_experiment_accelerator/run/tests/test_querying.py
+++ b/rag_experiment_accelerator/run/tests/test_querying.py
@@ -23,7 +23,7 @@ def setUp(self):
         self.mock_config.EF_CONSTRUCTIONS = [400]
         self.mock_config.EF_SEARCHES = [400]
         self.mock_config.SEARCH_VARIANTS = ["search_for_match_semantic"]
-        self.mock_config.NAME_PREFIX = "prefix"
+        self.mock_config.INDEX_NAME_PREFIX = "prefix"
         self.mock_config.RERANK_TYPE = "llm"
         self.mock_config.CHUNK_SIZES = [1]
         self.mock_config.OVERLAP_SIZES = [1]
@@ -277,7 +277,7 @@ def test_run_no_multi_no_rerank(
    mock_config.return_value.EF_CONSTRUCTIONS = [400]
    mock_config.return_value.EF_SEARCHES = [400]
    mock_config.return_value.SEARCH_VARIANTS = ["search_for_match_semantic"]
-   mock_config.return_value.NAME_PREFIX = "prefix"
+   mock_config.return_value.INDEX_NAME_PREFIX = "prefix"
    mock_config.return_value.RERANK = False
    mock_do_we_need_multiple_questions.return_value = False
    mock_query_and_eval_acs.return_value = [MagicMock(), MagicMock()]
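
Note: the snippet below is a minimal sketch, not part of the patch, of how the new config values are expected to combine at run time, based on the changes to eval.py and query_output_handler.py above. The helper names resolve_run_name and query_output_filename are illustrative only, the timestamp format stands in for the pre-existing formatted_datetime value in eval.py, and the example index name is abbreviated.

    from datetime import datetime


    def resolve_run_name(exp_name: str, job_name: str | None) -> str:
        # Mirrors the run_name logic added to evaluate_prompts(): use job_name
        # when it is set and non-empty, otherwise fall back to
        # "<experiment name>_<timestamp>".
        if job_name is not None and job_name != "":
            return job_name
        formatted_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # illustrative format
        return f"{exp_name}_{formatted_datetime}"


    def query_output_filename(index_name: str, experiment_name: str, job_name: str) -> str:
        # Mirrors QueryOutputHandler._get_output_name() after this patch.
        return f"eval_output_{index_name}_{experiment_name}_{job_name}.jsonl"


    # With the values from config.json in this patch:
    #   index_name_prefix="surface", experiment_name="surface", job_name="baseline"
    print(resolve_run_name("surface", "baseline"))  # -> "baseline"
    print(resolve_run_name("surface", ""))          # -> "surface_<timestamp>"
    print(query_output_filename("surface-1000-200", "surface", "baseline"))
    # -> "eval_output_surface-1000-200_surface_baseline.jsonl"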