From 04350fa70ccc20317162ae373e838de94a8df6d5 Mon Sep 17 00:00:00 2001
From: Alex Brooks
Date: Tue, 21 May 2024 14:12:57 -0600
Subject: [PATCH] Evaluation Data Format Scripts (#115)

* Add tentative script for formatting eval data
Signed-off-by: Alex-Brooks
* Add formatting for entities and tsa
Signed-off-by: Alex-Brooks
* Add docstrings to format script
Signed-off-by: Alex-Brooks
* Escape commas in tsa / entities
Signed-off-by: Alex-Brooks
* Add subsampling to script for pulling data
Signed-off-by: Alex-Brooks
* Add script to convert alpaca format to sft format
Signed-off-by: Alex-Brooks
* Add notes on formatting and evaluation
Signed-off-by: Alex-Brooks
* Fix venv guidance
Signed-off-by: Alex-Brooks
* Add hack for simple formatting
Signed-off-by: Alex-Brooks
* Add boto3 to dev deps
Signed-off-by: Alex-Brooks
* Add file and verbose flags to sft formatter
Signed-off-by: Alex-Brooks
* lint pull and fmt script
Signed-off-by: Alex-Brooks
* Update docs & review comments
Signed-off-by: Alex-Brooks
---------
Signed-off-by: Alex-Brooks
---
 pyproject.toml                      |   2 +-
 scripts/alpaca_to_sft_format.py     | 104 +++++++++++++
 scripts/evaluation.md               |  93 ++++++++++++
 scripts/pull_and_format_datasets.py | 228 ++++++++++++++++++++++++++++
 4 files changed, 426 insertions(+), 1 deletion(-)
 create mode 100644 scripts/alpaca_to_sft_format.py
 create mode 100644 scripts/evaluation.md
 create mode 100644 scripts/pull_and_format_datasets.py

diff --git a/pyproject.toml b/pyproject.toml
index 3bff676b5..2e96f9c7b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-dev = ["wheel", "packaging", "ninja", "scikit-learn>=1.0, <2.0"]
+dev = ["wheel", "packaging", "ninja", "scikit-learn>=1.0, <2.0", "boto3"]
 flash-attn = ["flash-attn"]
 aim = ["aim==3.19.0"]
 
diff --git a/scripts/alpaca_to_sft_format.py b/scripts/alpaca_to_sft_format.py
new file mode 100644
index 000000000..ad127e363
--- /dev/null
+++ b/scripts/alpaca_to_sft_format.py
@@ -0,0 +1,104 @@
+# Standard
+import argparse
+import os
+
+# Third Party
+import datasets
+
+# Prompt template to be used by default
+SIMPLE_PROMPT = "Input:\n{input}\n\n### Response:"
+
+# Prompt template to be used if --verbose is provided
+VERBOSE_PROMPT_INPUT = (
+    # pylint: disable=line-too-long
+    "Below is an instruction that describes a task, paired with an input that provides further context. "
+    "Write a response that appropriately completes the request.\n\n"
+    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
+)
+VERBOSE_PROMPT_NO_INPUT = (
+    "Below is an instruction that describes a task. "
+    "Write a response that appropriately completes the request.\n\n"
+    "### Instruction:\n{instruction}\n\n### Response:"
+)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the arguments and ensure everything is valid.
+
+    Returns:
+        argparse.Namespace
+            Parsed arguments object.
+    """
+    parser = argparse.ArgumentParser(
+        description="Converts Alpaca formatted data files into SFT formatted files."
+    )
+    parser.add_argument(
+        "--verbose",
+        help="Indicates whether or not the verbose format should be used.",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--files",
+        help="Alpaca formatted files to be converted to SFT format.",
+        nargs="+",
+        required=True,
+    )
+    return parser.parse_args()
+
+
+def format_alpaca_fn_simple(example: dict) -> dict[str, str]:
+    """Format a single example using the simple template format.
+
+    Args:
+        example: dict
+            Example to be formatted.
+
+    Returns:
+        dict[str, str]
+            Dictionary containing the formatted example.
+    """
+    output = SIMPLE_PROMPT.format_map(example)
+    output = f"{output} {example['output']}"
+    return {"output": output}
+
+
+def format_alpaca_fn_verbose(example: dict) -> dict[str, str]:
+    """Format a single example using the verbose template format.
+
+    Args:
+        example: dict
+            Example to be formatted.
+
+    Returns:
+        dict[str, str]
+            Dictionary containing the formatted example.
+    """
+    output = (
+        VERBOSE_PROMPT_INPUT.format_map(example)
+        if example.get("input", "") != ""
+        else VERBOSE_PROMPT_NO_INPUT.format_map(example)
+    )
+    output = f"{output} {example['output']}"
+    return {"output": output}
+
+
+if __name__ == "__main__":
+    parsed_args = parse_args()
+    format_alpaca_fn = (
+        format_alpaca_fn_verbose if parsed_args.verbose else format_alpaca_fn_simple
+    )
+
+    for file_path in parsed_args.files:
+        if not os.path.isfile(file_path):
+            raise ValueError(f"Alpaca dataset {file_path} does not exist!")
+        base_path, file_name = os.path.split(file_path)
+        export_path = os.path.join(base_path, f"sft_format_{file_name}")
+        print(
+            f"Converting alpaca format file: {file_path} to SFT trainer training format"
+        )
+        ds = datasets.load_dataset("json", data_files=file_path)
+        alpaca_ds = ds["train"].map(
+            format_alpaca_fn, remove_columns=["instruction", "input"]
+        )
+        alpaca_ds.to_json(export_path)
+        print(f"Exported SFT format data to {export_path}")
diff --git a/scripts/evaluation.md b/scripts/evaluation.md
new file mode 100644
index 000000000..379568374
--- /dev/null
+++ b/scripts/evaluation.md
@@ -0,0 +1,93 @@
+# Data Formatting / Evaluation
+This doc describes how to pull & format datasets into something that can be run against this repository for evaluation.
+
+## Pulling and Converting Datasets Into Alpaca Format
+In order to pull and format the datasets, you'll need to set the following environment variables:
+
+```bash
+export S3_ACCESS_KEY_ID={YOUR S3 KEY}
+export S3_SECRET_ACCESS_KEY={YOUR S3 SECRET ACCESS KEY}
+export S3_ENDPOINT={YOUR S3 ENDPOINT}
+```
+
+Next, pull and format the datasets. In order to run the data formatting & evaluation, you'll need a few extra dependencies, namely `boto3` and `scikit-learn`, which you can pull from the dev dependencies.
+
+NOTE: If you are running this from inside of a container with the library already installed, the easiest way to run evaluation is to copy & install the project's dev dependencies, e.g., with `pip3 install .[dev]`, inside of a virtual environment in the container.
+
+To pull and format the datasets into Alpaca format, run the following command:
+```bash
+python3 pull_and_format_datasets.py
+```
+
+Make sure you see everything under `formatted_data`! It should look something like this. Note that the numeric prefix for train/test files indicates how many samples were randomly sampled into the Alpaca formatted file:
+```bash
+ls -R formatted_data/
+
+formatted_data/:
+Entities  cc_tone  tsa_mams
+
+formatted_data/Entities:
+1000_IBM_NET_2019_DELIVERABLE_VERSION_C_train.json  IBM_NET_2019_DELIVERABLE_VERSION_C_dev.json
+500_IBM_NET_2019_DELIVERABLE_VERSION_C_test.json
+
+formatted_data/cc_tone:
+1000_train.json  500_test.json
+
+formatted_data/tsa_mams:
+1000_train.json  500_test.json  validation.json
+```
+
+## Converting Alpaca Format Datasets Into SFT Format
+
+In order to run a tuning on the datasets, you'll need to convert the Alpaca format datasets into the SFT format.
+You can do this with `alpaca_to_sft_format.py`; pass each Alpaca formatted file that you want to convert as part of the `--files` argument, as shown below.
+
+```bash
+python3 alpaca_to_sft_format.py \
+    --files \
+    formatted_data/cc_tone/1000_train.json \
+    formatted_data/Entities/1000_IBM_NET_2019_DELIVERABLE_VERSION_C_train.json \
+    formatted_data/tsa_mams/1000_train.json
+```
+
+Now, you should have a `sft_format_{file_name}` file in the same location as each of your aforementioned files. E.g.,
+```bash
+ls formatted_data/cc_tone/
+
+1000_train.json             # Alpaca format - don't worry about this one anymore
+500_test.json               # You will use this one for evaluation
+sft_format_1000_train.json  # You will use this one for tuning
+
+# And anything for validation, you can ignore
+```
+
+### Adding New Datasets to Pull / Format Script
+The `pull_and_format_datasets.py` script pulls and formats different datasets into the Alpaca format. In general, it needs to be updated per dataset being consumed, as it is dependent on the raw format of the dataset being converted.
+
+To add a new dataset, you'll need to add a new entry to the `DATASET_INFOS` in `pull_and_format_datasets.py`, which includes:
+
+- The location in the COS instance to pull from; it's assumed that all datasets live in the same COS instance
+- A formatting function which loads the raw dataset and exports it in the expected format for the task being considered; there are currently examples for entities/TSA and tone. Running the script and inspecting the Alpaca formatted examples for these datasets is the quickest way to understand the format for those tasks. Succinctly:
+  - For tone, the output per example is the labels separated by `, `, e.g., `"polite, excited, satisfied, sympathetic"`
+  - For TSA/entities, the output per example is the entities in the format `{text}: {type}` joined by `, `, e.g., `engineer: JobTitle, Thursday: Date`, or `"atmosphere: positive, drinks: neutral"`
+
+### Running a Tuning
+There are some resources on how to run a tuning against a cluster in the [wiki](https://github.com/foundation-model-stack/fms-hf-tuning/wiki/Installing-and-Testing-OpenShift-fms%E2%80%90hf%E2%80%90tuning-Stack#6-testing) which should be useful. If you are planning to run a tuning / evaluation from inside of a running container with GPUs available, the easiest way to configure your tuning job is to create a local `config.json` with your training specs that is analogous to the mounted configmap, and repoint the `SFT_TRAINER_CONFIG_JSON_PATH` env var to that local config. After doing this, you can trigger a tuning by running `python3 /app/accelerate_launch.py`.
+
+There are a few things to be careful of here:
+
+- When tuning, make sure to use `"\n### Response:"` as your response template
+- If you see a bunch of warnings like `Could not find response key `[19371, 27]` in the following instance:`, you're doing something wrong! The most likely offenders are pointing the tuning at the Alpaca data, or using the wrong response template. This should not happen.
+- Make sure your `SFT_TRAINER_CONFIG_JSON_PATH` is pointing at the right place if you're inside of a pod with a mounted configmap, otherwise your tuning will be pointing at the wrong config!
+
+### Running Evaluation
+To run the evaluation, point the evaluation script at your exported model and test data.
+For example:
+
+```bash
+python3 run_evaluation.py \
+    --model {PATH TO YOUR TUNED MODEL} \
+    --delimiter , \
+    --data_path formatted_data/cc_tone/500_test.json \
+    --max_new_tokens 100
+```
+
+To understand the files created by the evaluation, check out the comments in this [PR](https://github.com/foundation-model-stack/fms-hf-tuning/pull/102).
diff --git a/scripts/pull_and_format_datasets.py b/scripts/pull_and_format_datasets.py
new file mode 100644
index 000000000..00287988f
--- /dev/null
+++ b/scripts/pull_and_format_datasets.py
@@ -0,0 +1,228 @@
+"""Pulls and formats data into Alpaca format. Note that each specific dataset needs its own
+formatter, since it is dependent on the format of the raw dataset being processed.
+"""
+
+# Standard
+from shutil import rmtree
+from typing import Any, Optional
+import json
+import os
+import random
+
+# Third Party
+import boto3
+
+S3_ACCESS_KEY_ID = os.getenv("S3_ACCESS_KEY_ID")
+S3_SECRET_ACCESS_KEY = os.getenv("S3_SECRET_ACCESS_KEY")
+S3_ENDPOINT = os.getenv("S3_ENDPOINT")
+if S3_ACCESS_KEY_ID is None or S3_SECRET_ACCESS_KEY is None or S3_ENDPOINT is None:
+    raise ValueError(
+        "Error - must set env vars: S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, and S3_ENDPOINT"
+    )
+
+##### data formatters
+def format_and_export_cc_tone_file(
+    file_path: str, export_path: str, num_samples: Optional[int]
+):
+    """Formats the tone dataset with comma separated labels in the output.
+
+    Args:
+        file_path: str
+            Path to tone file to be formatted.
+        export_path: str
+            Path to export the formatted data to.
+        num_samples: Optional[int]
+            Number of samples to be included in the formatted file.
+    """
+    with open(file_path, "r", encoding="utf-8") as tone_file:
+        data = [json.loads(line) for line in tone_file.readlines() if line]
+
+    export_path = export_path.split(".")[0] + ".json"
+    if num_samples:
+        data = random.sample(data, num_samples)
+        # Update the file name to prepend num samples
+        base_path, export_name = os.path.split(export_path)
+        export_path = os.path.join(base_path, f"{num_samples}_{export_name}")
+
+    formatted_data = [
+        {
+            "instruction": "",
+            "input": datum["text"],
+            "output": ", ".join(datum["labels"]),
+        }
+        for datum in data
+    ]
+    with open(export_path, "w", encoding="utf-8") as export_file:
+        json.dump(formatted_data, export_file, sort_keys=True, indent=4)
+
+
+def format_and_export_entities_file(
+    file_path: str, export_path: str, num_samples: Optional[int]
+):
+    """Formats the entities/TSA datasets by setting the output to literal "None"
+    if no target is extracted, and a comma separated list in the format
+    {text}: {type}
+    for each extracted object.
+    Example for entities: "waitress: JobTitle"
+    Example for TSA: "waitress: positive"
+
+    Args:
+        file_path: str
+            Path to entities/TSA file to be formatted.
+        export_path: str
+            Path to export the formatted data to.
+        num_samples: Optional[int]
+            Number of samples to be included in the formatted file.
+    """
+
+    def get_entities_output_text(datum):
+        mentions = datum["mentions"]
+        if not mentions:
+            return "None"
+        mention_strs = [
+            f"{mention['text']}: {mention['type']}".replace(",", "\\,")
+            for mention in mentions
+        ]
+        return ", ".join(mention_strs)
+
+    with open(file_path, "r", encoding="utf-8") as entities_file:
+        data = json.load(entities_file)
+    if num_samples:
+        data = random.sample(data, num_samples)
+        # Update the file name to prepend num samples
+        base_path, export_name = os.path.split(export_path)
+        export_path = os.path.join(base_path, f"{num_samples}_{export_name}")
+
+    formatted_data = [
+        {
+            "instruction": "",
+            "input": datum["text"],
+            "output": get_entities_output_text(datum),
+        }
+        for datum in data
+    ]
+    with open(export_path, "w", encoding="utf-8") as export_file:
+        json.dump(formatted_data, export_file, sort_keys=True, indent=4)
+
+
+# Where we will put the downloaded data
+DOWNLOAD_DIR = "unformatted_data"
+# Where we will put the formatted data files
+EXPORT_DIR = "formatted_data"
+
+COS_LOCATION_KEY = "cos_location"
+FORMAT_FUNC_KEY = "format_func"
+SUBSAMPLE_KEY = "subsample_info"
+DATASET_INFOS = [
+    {
+        COS_LOCATION_KEY: "fm-validation-staging-models-and-datasets/datasets/unitxt/cc_tone",
+        FORMAT_FUNC_KEY: format_and_export_cc_tone_file,
+        SUBSAMPLE_KEY: {
+            "train.jsonl": 1000,
+            "test.jsonl": 500,
+        },
+    },
+    {
+        # pylint: disable=line-too-long
+        COS_LOCATION_KEY: "fm-validation-staging-models-and-datasets/datasets/unitxt/en/Extraction/Entities",
+        FORMAT_FUNC_KEY: format_and_export_entities_file,
+        SUBSAMPLE_KEY: {
+            "IBM_NET_2019_DELIVERABLE_VERSION_C_train.json": 1000,
+            "IBM_NET_2019_DELIVERABLE_VERSION_C_test.json": 500,
+        },
+    },
+    {
+        COS_LOCATION_KEY: "fm-validation-staging-models-and-datasets/datasets/unitxt/tsa_mams",
+        FORMAT_FUNC_KEY: format_and_export_entities_file,
+        SUBSAMPLE_KEY: {
+            "train.json": 1000,
+            "test.json": 500,
+        },
+    },
+]
+
+
+def create_data_dirs():
+    """Create the directories to contain formatted/unformatted data."""
+    print("Creating data directories...")
+    if os.path.exists(DOWNLOAD_DIR):
+        rmtree(DOWNLOAD_DIR)
+    if os.path.exists(EXPORT_DIR):
+        rmtree(EXPORT_DIR)
+    os.mkdir(DOWNLOAD_DIR)
+    os.mkdir(EXPORT_DIR)
+
+
+def download_datasets(dataset_infos: list[dict[str, Any]]):
+    """Download the datasets to local disk.
+
+    Args:
+        dataset_infos: list[dict[str, Any]]
+            Structure containing information about each dataset we need to download
+            and where it lives in the connected S3 instance.
+    """
+    s3 = boto3.resource(
+        "s3",
+        aws_access_key_id=S3_ACCESS_KEY_ID,
+        aws_secret_access_key=S3_SECRET_ACCESS_KEY,
+        endpoint_url=S3_ENDPOINT,
+    )
+    for dataset_info in dataset_infos:
+        first_slash_idx = dataset_info[COS_LOCATION_KEY].find("/")
+        bucket_name = dataset_info[COS_LOCATION_KEY][:first_slash_idx]
+        cos_path = dataset_info[COS_LOCATION_KEY][first_slash_idx + 1 :]
+        # Make the subdir to download files into...
+        download_subdir = os.path.join(DOWNLOAD_DIR, cos_path.split(os.sep)[-1])
+        os.mkdir(download_subdir)
+        print(f"Downloading files for {download_subdir}")
+        # Download the unformatted data files...
+        bucket = s3.Bucket(bucket_name)
+        for obj in bucket.objects.filter(Prefix=cos_path):
+            if obj.key[-1] == "/":
+                continue
+            target_path = os.path.join(download_subdir, obj.key.split("/")[-1])
+            bucket.download_file(obj.key, target_path)
+
+
+def apply_data_formatters(dataset_infos: list[dict[str, Any]]):
+    """Formats each of the downloaded datasets.
+
+    Args:
+        dataset_infos: list[dict[str, Any]]
+            Structure containing information about each dataset we need to format.
+    """
+    for dataset_info in dataset_infos:
+        subdir = dataset_info[COS_LOCATION_KEY].split(os.sep)[-1]
+        download_subdir = os.path.join(DOWNLOAD_DIR, subdir)
+        # Make the dir to export files to
+        export_subdir = os.path.join(EXPORT_DIR, subdir)
+        os.mkdir(export_subdir)
+
+        data_files = [
+            os.path.join(download_subdir, filename)
+            for filename in os.listdir(download_subdir)
+        ]
+        for data_file in data_files:
+            # Apply this dataset's formatter to the data file
+            export_path = os.path.join(
+                EXPORT_DIR, data_file[data_file.index(os.sep) + 1 :]
+            )
+            # Look up the subsample size for this file, if one is configured;
+            # files not listed under SUBSAMPLE_KEY are exported in full
+            filename = os.path.split(data_file)[-1]
+            num_samples = (
+                dataset_info[SUBSAMPLE_KEY][filename]
+                if filename in dataset_info[SUBSAMPLE_KEY]
+                else None
+            )
+            if num_samples:
+                print(f"--> File: {data_file} will be subsampled to {num_samples}")
+            # make sure the split is deterministic
+            random.seed(42)
+            dataset_info[FORMAT_FUNC_KEY](data_file, export_path, num_samples)
+
+
+if __name__ == "__main__":
+    create_data_dirs()
+    download_datasets(DATASET_INFOS)
+    apply_data_formatters(DATASET_INFOS)
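+
+# Illustrative note (a sketch only; the record values below are hypothetical, not taken
+# from the real datasets): a record exported into formatted_data/ by the formatters above
+# has the Alpaca shape
+#     {"instruction": "", "input": "The drinks were great.", "output": "drinks: positive"}
+# and, after running scripts/alpaca_to_sft_format.py on that file with the default
+# (non --verbose) template, the corresponding SFT-format record collapses the prompt and
+# label into a single "output" field:
+#     {"output": "Input:\nThe drinks were great.\n\n### Response: drinks: positive"}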