RELEASE: 0.0.2a01
brucewlee committed May 29, 2024
1 parent 9bdb4c7 commit 2a7f0f5
Showing 20 changed files with 713 additions and 287 deletions.
4 changes: 4 additions & 0 deletions nutcracker/data/data_config/task/mfq-30.yaml
@@ -0,0 +1,4 @@
filename:
test: test.json
config: config.yaml
task_name: mfq-30
4 changes: 4 additions & 0 deletions nutcracker/data/data_config/task/pvq-rr.yaml
@@ -0,0 +1,4 @@
filename:
test: test.json
config: config.yaml
task_name: pvq-rr
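Both new task entries (mfq-30 above and pvq-rr here) share the same three-key layout: a filename block pointing at the task's test.json and config.yaml, plus the task_name used for lookup. A minimal reading sketch, assuming PyYAML and a hypothetical task directory layout (this is not nutcracker's own loader):

import json
import yaml  # PyYAML

def read_task_entry(entry_path: str, task_dir: str):
    # entry_path: one of the new registry files, e.g. ".../task/mfq-30.yaml"
    # task_dir:   assumed directory holding that task's test.json and config.yaml
    with open(entry_path) as f:
        entry = yaml.safe_load(f)                    # {"filename": {...}, "task_name": "mfq-30"}
    with open(f"{task_dir}/{entry['filename']['test']}") as f:
        test_data = json.load(f)                     # survey items later wrapped as instances
    return entry["task_name"], test_data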
50 changes: 50 additions & 0 deletions nutcracker/data/instance.py
@@ -32,10 +32,17 @@ def create_instance(
# example_data_list can be empty for zero-shot evaluation
example_data_list = kwargs.get('example_data_list')
return MCQInstance(config, test_data, example_data_list)

elif config['construction']['class'].lower().strip() == "mcq-survey":
# example_data_list can be empty for zero-shot evaluation
example_data_list = kwargs.get('example_data_list')
return MCQSurveyInstance(config, test_data, example_data_list)

elif config['construction']['class'].lower().strip() == "frq":
# example_data_list can be empty for zero-shot evaluation
example_data_list = kwargs.get('example_data_list')
return FRQInstance(config, test_data, example_data_list)

else:
raise ValueError("Invalid instance construction")

@@ -169,6 +176,49 @@ def _format_user_prompt(self) -> str:



class MCQSurveyInstance(MCQInstance):
def __init__(
self,
config: dict,
test_data: dict,
example_data_list: List[dict] = None
) -> None:
"""`MCQSurveyInstance` constructor.
Args:
config (dict): Configuration for the instance.
test_data (dict): Test data for the instance.
example_data_list (List[dict], optional): List of example data. Defaults to None.
Raises:
None
Returns:
None
"""
# task-level attributes
self.config = config
self.example_data_list = example_data_list # not a list of Instance, just a list of dicts

# below are attributes that are initialized from data
self.centerpiece = test_data["centerpiece"]
self.options = test_data["options"]
self.correct_options = None
self.question_number = test_data["question_number"]

# Check if 'context' key exists in test_data
self.context_exists = 'context' in test_data and test_data['context']
if self.context_exists:
self.context = test_data["context"]

# below are derivational attributes that will be updated during code run
self.user_prompt = self._format_user_prompt()
self.model_response = None
self.model_response_logprobs = None
self.response_correct = False
self.response_evaluator = None


class FRQInstance(Instance):
def __init__(
self,
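The create_instance hunk above adds an "mcq-survey" branch alongside "mcq" and "frq"; matching is case-insensitive and whitespace-tolerant, and the survey class keeps correct_options as None because survey items have no ground-truth answer. A self-contained illustration of that dispatch (not nutcracker's code):

def pick_instance_class(construction_class: str) -> str:
    # Mirrors the dispatch in create_instance; illustration only.
    kind = construction_class.lower().strip()
    if kind == "mcq":
        return "MCQInstance"
    elif kind == "mcq-survey":        # new in 0.0.2a01
        return "MCQSurveyInstance"    # like MCQInstance, but correct_options stays None
    elif kind == "frq":
        return "FRQInstance"
    raise ValueError("Invalid instance construction")

print(pick_instance_class("  MCQ-Survey "))   # -> MCQSurveyInstance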
1 change: 0 additions & 1 deletion nutcracker/data/instance_collection.py
@@ -7,7 +7,6 @@
#
#
import numpy as np
from scipy.spatial.distance import pdist, squareform
#
#
#
4 changes: 3 additions & 1 deletion nutcracker/evaluator/__init__.py
@@ -1,4 +1,6 @@
from nutcracker.evaluator.mcq_evaluator import MCQEvaluator
from nutcracker.evaluator.frq_evaluator import FRQEvaluator
from nutcracker.evaluator.auto_evaluator import AutoEvaluator
from nutcracker.evaluator.reporter import generate_report
from nutcracker.evaluator.reporter.reporter import generate_report
from nutcracker.evaluator.reporter.mfq_reporter import mfq_30_generate_report
from nutcracker.evaluator.reporter.pvq_reporter import pvq_rr_generate_report
20 changes: 15 additions & 5 deletions nutcracker/evaluator/auto_evaluator.py
@@ -1,34 +1,44 @@
from typing import List, Union
import logging
import logging, os
#
from nutcracker.data.task import Task
from nutcracker.data.pile import Pile
from nutcracker.data.instance import MCQInstance
from nutcracker.data.instance import MCQInstance, MCQSurveyInstance
from nutcracker.data.instance import FRQInstance
from nutcracker.evaluator.mcq_evaluator import MCQEvaluator
from nutcracker.evaluator.frq_evaluator import FRQEvaluator
from nutcracker.models import *
#
#
class AutoEvaluator:
def __init__(self, data: Union[Pile, Task, List[MCQInstance], List[FRQInstance]], mcq_judge = 'recommended', frq_judge = 'recommended', **judge_kwargs) -> None:
def __init__(
self,
data: Union[Pile, Task, List[MCQInstance], List[FRQInstance]],
model = OpenAI_ChatGPT(api_key=os.getenv("OPENAI_API_KEY", "default_api_key")),
mcq_judge = 'recommended',
frq_judge = 'recommended',
**judge_kwargs
) -> None:
self.data = data
self.judge_kwargs = judge_kwargs
self._control_logging()
self.frq_judge = frq_judge
self.mcq_judge = mcq_judge
self.model = model

def run(self, round_digits: int = 5) -> float:
mcq_data = [instance for instance in self.data if isinstance(instance, MCQInstance)]
mcq_survey_data = [instance for instance in self.data if isinstance(instance, (MCQInstance, MCQSurveyInstance))]
frq_data = [instance for instance in self.data if isinstance(instance, FRQInstance)]
self.logger.info(f"found {len(mcq_data)} MCQInstances.")
self.logger.info(f"found {len(frq_data)} FRQInstances.")

if mcq_data:
mcq_evaluator = MCQEvaluator(mcq_data, judge=self.mcq_judge, **self.judge_kwargs)
mcq_evaluator = MCQEvaluator(mcq_data, self.model, judge=self.mcq_judge, **self.judge_kwargs)
mcq_evaluator.run(round_digits)

if frq_data:
frq_evaluator = FRQEvaluator(frq_data, judge=self.frq_judge, **self.judge_kwargs)
frq_evaluator = FRQEvaluator(frq_data, self.model, judge=self.frq_judge, **self.judge_kwargs)
frq_evaluator.run(round_digits)

# This function currently does not return accuracy. Modify as needed or use separate reporting.
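AutoEvaluator now threads a judge model through to MCQEvaluator and FRQEvaluator. The default OpenAI_ChatGPT(...) argument is evaluated once at import time, from whatever OPENAI_API_KEY holds then, so passing a model explicitly is the safer pattern. A hedged usage sketch based only on the signature above (the task argument is assumed to be an already-responded Task, Pile, or list of instances):

import os
from nutcracker.evaluator import AutoEvaluator
from nutcracker.models import OpenAI_ChatGPT

def evaluate_with_explicit_judge(task):
    # `task` is assumed to already contain model responses to be judged.
    judge_model = OpenAI_ChatGPT(api_key=os.environ["OPENAI_API_KEY"])
    evaluator = AutoEvaluator(task, model=judge_model)   # mcq_judge / frq_judge stay 'recommended'
    evaluator.run(round_digits=5)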
15 changes: 6 additions & 9 deletions nutcracker/evaluator/frq_evaluator.py
@@ -5,20 +5,17 @@
from nutcracker.data.pile import Pile
from nutcracker.data.instance import FRQInstance
from nutcracker.utils import TqdmLoggingHandler
from nutcracker.evaluator.judges.frq_judge_alpha import FRQJudgeAlpha
from nutcracker.evaluator.judges.frq_judge_beta import FRQJudgeBeta
from nutcracker.evaluator.judges.frq_judge import FRQJudge
from nutcracker.models import *
#
#
class FRQEvaluator:
def __init__(
self, data: Union[Pile, Task, List[FRQInstance]], judge: str = 'alpha', **judge_kwargs) -> None:
self, data: Union[Pile, Task, List[FRQInstance]], model, judge: str = 'alpha', **judge_kwargs) -> None:
self.data = data
if judge == 'alpha' or judge == 'recommended':
self.response_evaluator_judge = 'frq-judge-alpha'
self.judge = FRQJudgeAlpha(**judge_kwargs)
elif judge == 'beta' or judge == 'recommended':
self.response_evaluator_judge = 'frq-judge-beta'
self.judge = FRQJudgeBeta(**judge_kwargs)
self.model = model
self.response_evaluator_judge = f'frq-judge-{self.model}'
self.judge = FRQJudge(self.model)
self._control_logging()


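FRQEvaluator now takes the judge model as a required second argument, uses it to build the judge name (f'frq-judge-{self.model}'), and hands it to the single FRQJudge; the judge string is kept but every choice now routes to that one judge. As far as this commit shows, the only call the judge makes on the model is respond(prompt) -> str, so a minimal stand-in for offline testing could look like the sketch below (StaticJudgeModel is hypothetical, not part of nutcracker):

from nutcracker.evaluator import FRQEvaluator

class StaticJudgeModel:
    """Hypothetical stand-in model: always judges a response as correct."""
    def respond(self, prompt: str) -> str:
        return "True"              # the judge strips and uppercases this

    def __str__(self) -> str:
        return "static-judge"      # appears in the judge name 'frq-judge-static-judge'

def build_offline_frq_evaluator(frq_instances):
    # `frq_instances` is assumed to be a list of already-responded FRQInstance objects.
    return FRQEvaluator(frq_instances, StaticJudgeModel(), judge='recommended')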
@@ -8,11 +8,9 @@
#
#
#
class FRQJudgeAlpha:
def __init__(self):
# Use the kwargs as needed for configuration
# Example: self.some_setting = kwargs.get('some_setting', default_value)
pass
class FRQJudge:
def __init__(self, model):
self.model = model



@@ -29,8 +27,7 @@ def is_correct(self, instance: FRQInstance) -> bool:



@staticmethod
def _parse_model_response_intent_matching(response: str, correct_options: list) -> str:
def _parse_model_response_intent_matching(self, response: str, correct_options: list) -> str:
client = OpenAI()
few_shot = f"""
Your job is: given a response, determine if the answer is correct or not. Say True or False and nothing else.
@@ -70,20 +67,12 @@ def _parse_model_response_intent_matching(response: str, correct_options: list)
interpreted_response = None
while interpreted_response is None:
try:
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": 'You respond with True or False and nothing else.'},
{"role": "user", "content": few_shot}
],
seed=123456789,
timeout=15,
temperature=1
completion = self.model.respond(
'You respond with True or False and nothing else.\n\n' + few_shot
)
interpreted_response = completion.choices[0].message.content.strip()
interpreted_response = completion.strip().upper()
break
except openai.BadRequestError:
interpreted_response = "None"
except KeyboardInterrupt:
sys.exit()
return interpreted_response

return set(interpreted_response.split(', ')) if interpreted_response else set()
29 changes: 0 additions & 29 deletions nutcracker/evaluator/judges/frq_judge_beta.py

This file was deleted.

@@ -1,23 +1,22 @@
from typing import Set, Optional
#
from nutcracker.data.instance import MCQInstance
from nutcracker.models import *
#
#
from openai import OpenAI
#
#
#
class MCQJudgeZeta:
def __init__(self):
# Use the kwargs as needed for configuration
# Example: self.some_setting = kwargs.get('some_setting', default_value)
pass

class MCQJudge:
def __init__(self, model):
self.model = model


def is_correct(self, instance: MCQInstance) -> bool:
# First, try rule-based parsing
found_options = self._parse_model_response_rule_based(instance.model_response)

if instance.correct_options:
correct_options_set = set(instance.correct_options)
else:
correct_options_set = None

# If rule-based parsing fails or is ambiguous and intent matching is not disabled, use intent-matching
if not found_options:
@@ -26,9 +25,8 @@ def is_correct(self, instance: MCQInstance) -> bool:
return False # Consider not rule-matched responses as wrong if intent matching is disabled

instance.response_parsed = found_options
correct_options_set = set(instance.correct_options)

return found_options == correct_options_set
return found_options == correct_options_set, found_options



@@ -64,65 +62,70 @@ def _parse_model_response_rule_based(response: str) -> Optional[Set[str]]:



@staticmethod
def _parse_model_response_intent_matching(response: str) -> Set[str]:
client = OpenAI()
def _parse_model_response_intent_matching(self, response: str) -> Set[str]:
few_shot = f"""
Your job is: given a response, determine which option the response is pointing to. That is, classify a given response to discrete labels: A, B, C, D, E, ..., Z, or None (if the response is pointing to multiple labels give multiple).
Examples:
Example 1 - Clear Single Response
Response: 'The answer is A.'
Interpretation: A
Example 2 - Clear Multi Response
Response: 'I believe B and C are correct.'
Interpretation: B, C
Example 3 - Clear Single Response
Response: 'Definitely D.'
Interpretation: D
Example 4 - Clear Single Response
Response: 'Although many think it's A, the correct answer is actually D.'
Interpretation: D
Example 5 - Clear Multi Response
Response: 'A seems right, but after further analysis, B and D are more accurate.'
Interpretation: B, D
Example 6 - Not a Response
Response: 'Question: Which of the following will cause a factory'
Interpretation: None
Example 7 - Clear Single Response
Response: 'Options A and B are incorrect, so it must be C.'
Interpretation: C
Example 8 - Not a Response
Response: 'Please choose the answer you think is correct and press the submit answer button.'
Interpretation: None
Example 9 - Clear Single Response
Response: 'Either A or B could work, but I lean towards B.'
Interpretation: B
Example 10 - Clear Single Response
Response: 'A. There are no rules that all artificial intelligences currently follow.'
Interpretation: A
Example 11 - Response Includes Another Question
Response: 'Question: The process of creating new concepts, ideas, and innovations is called A. invention. B. creativity. C. technology. D. entrepreneurship.? Answer: B'
Interpretation: None
Example 12 - Clear Single Response
Response: 'Answer: E. Like you.'
Interpretation: E
Now consider,
Response: '{response}'
Interpretation:
"""

interpreted_response = None
while interpreted_response is None:
try:
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": 'You respond with the letter option (like A, B, D, or None) separated by commas and nothing else.'},
{"role": "user", "content": few_shot}
],
seed=123456789,
timeout=15,
temperature=1
completion = self.model.respond(
'You respond with the letter option (like A, B, D, F, or None) separated by commas and nothing else.\n\n' + few_shot
)
interpreted_response = completion.choices[0].message.content.strip().upper()
interpreted_response = completion.strip().upper()
break
except KeyboardInterrupt:
sys.exit()
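Two behavioural notes fall out of the MCQJudge hunks above: the intent-matching prompt is now routed through self.model.respond(...) instead of a hard-wired gpt-3.5-turbo call, and the final return statement now yields both the verdict and the parsed option set, while the earlier early-exit branch still returns a bare False. A hedged sketch of handling both shapes, with the judge and instance assumed to be constructed elsewhere:

def grade_mcq(judge, instance):
    # `judge` is an MCQJudge built with a model; `instance` is a responded MCQInstance.
    result = judge.is_correct(instance)
    if isinstance(result, tuple):          # main path: (bool, parsed option set), e.g. (True, {"B"})
        is_right, found_options = result
    else:                                  # early-exit path shown above still returns a bare bool
        is_right, found_options = result, set()
    return is_right, found_options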