Support generating reliability test programs and inputs (#1802)
* Commit

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

---------

Signed-off-by: dbczumar <[email protected]>
dbczumar authored Nov 16, 2024
1 parent 1f93bff commit 83f24f9
Showing 8 changed files with 888 additions and 29 deletions.
25 changes: 25 additions & 0 deletions tests/reliability/README.md
@@ -47,6 +47,31 @@ Each test in this directory executes a DSPy program using various LLMs. By runni

This will execute all tests for the configured models and display detailed results for each model configuration. Tests are set up to mark expected failures for known challenging cases where a specific model might struggle, while actual (unexpected) DSPy reliability issues are flagged as failures (see below).

#### Running specific generated tests

You can run specific generated tests by using the `-k` flag with `pytest`. For example, to test the generated program located at `tests/reliability/complex_types/generated/test_nesting_1` against generated test input `input1.json`, you can run the following command from this directory:

```bash
pytest test_generated.py -k "test_nesting_1-input1"
```

### Test generation

You can generate DSPy test programs and test inputs from text descriptions using the `tests.reliability.generate` CLI or the `tests.reliability.generate.generate_test_cases` API. For example, to generate a test classification program and 3 challenging test inputs in the `tests/reliability/classification/generated` directory, you can run the following command from the DSPy repository root directory:

```bash
python \
-m tests.reliability.generate \
-d tests/reliability/classification/generated/test_example \
-p "Generate a program that performs a classification task involving objects with multiple properties. The task should be realistic" \
-i "Based on the program description, generate a challenging example" \
-n 3
```

The test program will be written to `tests/reliability/classification/generated/test_example/program.py`, and the test inputs will be written as JSON files to the `tests/reliability/classification/generated/test_example/inputs/` directory.
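
The same generation can be driven programmatically through the `generate_test_cases` API, whose signature appears in `tests/reliability/generate/__init__.py` below. A minimal sketch of the equivalent call, assuming it is run from the DSPy repository root with a language model already configured for generation:

```python
from tests.reliability.generate import generate_test_cases

# Generate (or reuse) a test program and write 3 challenging JSON test inputs
# under tests/reliability/classification/generated/test_example/.
cases = generate_test_cases(
    dst_path="tests/reliability/classification/generated/test_example",
    num_inputs=3,
    program_instructions=(
        "Generate a program that performs a classification task involving"
        " objects with multiple properties. The task should be realistic"
    ),
    input_instructions="Based on the program description, generate a challenging example",
)
# `cases` is a list of GeneratedTestCase objects describing the program and its inputs.
```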

All generated tests should be located in directories with the structure `tests/reliability/<test_type>/generated/<test_name>`, where `<test_type>` is the type of test (e.g., `classification`, `complex_types`, `chat`, etc.), and `<test_name>` is a descriptive name for the test.

### Known Failing Models

Some tests may be expected to fail with certain models, especially in challenging cases. These known failures are logged but do not affect the overall test result. This setup allows us to keep track of model-specific limitations without obstructing general test outcomes. Models that are known to fail a particular test case are specified using the `@known_failing_models` decorator. For example:
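
The concrete example that follows is collapsed in this diff view. As a purely hypothetical illustration (the decorator's import path and argument format are assumptions, not taken from this diff), a test might be marked like this:

```python
# Hypothetical illustration only: import path and argument format are assumed.
from tests.reliability.utils import known_failing_models


@known_failing_models(["llama-3.2-3b-instruct"])
def test_challenging_classification():
    # Runs like any other reliability test; a failure under the listed model
    # is recorded as a known (expected) failure rather than a test failure.
    ...
```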
14 changes: 5 additions & 9 deletions tests/reliability/conftest.py
Expand Up @@ -4,7 +4,7 @@

import dspy
from tests.conftest import clear_settings
from tests.reliability.utils import parse_reliability_conf_yaml
from tests.reliability.utils import get_adapter, parse_reliability_conf_yaml

# Standard list of models that should be used for periodic DSPy reliability testing
MODEL_LIST = [
@@ -46,13 +46,7 @@ def configure_model(request):
    module_dir = os.path.dirname(os.path.abspath(__file__))
    conf_path = os.path.join(module_dir, "reliability_conf.yaml")
    reliability_conf = parse_reliability_conf_yaml(conf_path)

    if reliability_conf.adapter.lower() == "chat":
        adapter = dspy.ChatAdapter()
    elif reliability_conf.adapter.lower() == "json":
        adapter = dspy.JSONAdapter()
    else:
        raise ValueError(f"Unknown adapter specification '{adapter}' in reliability_conf.yaml")
    adapter = get_adapter(reliability_conf)

    model_name, should_ignore_failure = request.param
    model_params = reliability_conf.models.get(model_name)
@@ -61,7 +55,9 @@
        dspy.configure(lm=lm, adapter=adapter)
    else:
        pytest.skip(
            f"Skipping test because no reliability testing YAML configuration was found" f" for model {model_name}."
            f"Skipping test because no reliability testing YAML configuration was found"
            f" for model {model_name}, or the YAML configuration is missing LiteLLM parameters"
            f" for this model ('litellm_params' section of conf file is missing)."
        )

    # Store `should_ignore_failure` flag on the request node for use in post-test handling
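
The inline adapter-selection branch removed above is replaced by a call to a `get_adapter` helper that this diff does not show. A minimal sketch of that helper, assuming it simply wraps the removed logic inside `tests/reliability/utils.py`:

```python
# Hypothetical sketch; the actual helper lives in tests/reliability/utils.py
# and is not shown in this diff.
import dspy


def get_adapter(reliability_conf):
    adapter_name = reliability_conf.adapter.lower()
    if adapter_name == "chat":
        return dspy.ChatAdapter()
    elif adapter_name == "json":
        return dspy.JSONAdapter()
    else:
        raise ValueError(f"Unknown adapter specification '{adapter_name}' in reliability_conf.yaml")
```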
38 changes: 38 additions & 0 deletions tests/reliability/generate/__init__.py
@@ -0,0 +1,38 @@
import os
from typing import List, Optional

from tests.reliability.generate.utils import (
    GeneratedTestCase,
    generate_test_inputs,
    generate_test_program,
    load_generated_cases,
    load_generated_program,
)


def generate_test_cases(
    dst_path: str,
    num_inputs: int = 1,
    program_instructions: Optional[str] = None,
    input_instructions: Optional[str] = None,
) -> List[GeneratedTestCase]:
    os.makedirs(dst_path, exist_ok=True)
    if _directory_contains_program(dst_path):
        print(f"Found an existing test program at path {dst_path}. Generating new" f" test inputs for this program.")
    else:
        print("Generating a new test program and test inputs")
        generate_test_program(
            dst_path=dst_path,
            additional_instructions=program_instructions,
        )
    generate_test_inputs(
        dst_path=os.path.join(dst_path, "inputs"),
        program_path=os.path.join(dst_path, "program.py"),
        num_inputs=num_inputs,
        additional_instructions=input_instructions,
    )
    return load_generated_cases(dir_path=dst_path)


def _directory_contains_program(dir_path: str) -> bool:
    return any(file == "program.py" for file in os.listdir(dir_path))
29 changes: 29 additions & 0 deletions tests/reliability/generate/__main__.py
@@ -0,0 +1,29 @@
import argparse

from tests.reliability.generate import generate_test_cases

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate test cases by specifying configuration and input instructions."
    )
    parser.add_argument(
        "-d", "--dst_path", type=str, required=True, help="Destination path where generated test cases will be saved."
    )
    parser.add_argument(
        "-n", "--num_inputs", type=int, default=1, help="Number of input cases to generate (default: 1)."
    )
    parser.add_argument(
        "-p", "--program_instructions", type=str, help="Additional instructions for the generated test program."
    )
    parser.add_argument(
        "-i", "--input_instructions", type=str, help="Additional instructions for generating test inputs."
    )

    args = parser.parse_args()

    generate_test_cases(
        dst_path=args.dst_path,
        num_inputs=args.num_inputs,
        program_instructions=args.program_instructions,
        input_instructions=args.input_instructions,
    )