From 2a7f0f5aab00da415c85493444434b6871b08d13 Mon Sep 17 00:00:00 2001 From: Bruce Lee Date: Wed, 29 May 2024 12:49:37 -0700 Subject: [PATCH] RELEASE: 0.0.2a01 --- nutcracker/data/data_config/task/mfq-30.yaml | 4 + nutcracker/data/data_config/task/pvq-rr.yaml | 4 + nutcracker/data/instance.py | 50 +++ nutcracker/data/instance_collection.py | 1 - nutcracker/evaluator/__init__.py | 4 +- nutcracker/evaluator/auto_evaluator.py | 20 +- nutcracker/evaluator/frq_evaluator.py | 15 +- .../{frq_judge_alpha.py => frq_judge.py} | 29 +- nutcracker/evaluator/judges/frq_judge_beta.py | 29 -- .../{mcq_judge_zeta.py => mcq_judge.py} | 59 ++-- .../evaluator/judges/mcq_judge_alpha.py | 92 ------ nutcracker/evaluator/judges/mcq_judge_beta.py | 47 --- .../evaluator/judges/mcq_judge_gamma.py | 33 -- nutcracker/evaluator/mcq_evaluator.py | 26 +- nutcracker/evaluator/reporter/mfq_reporter.py | 93 ++++++ nutcracker/evaluator/reporter/pvq_reporter.py | 168 ++++++++++ .../evaluator/{ => reporter}/reporter.py | 0 nutcracker/models.py | 305 ++++++++++++++++++ requirements.txt | 11 + setup.py | 10 +- 20 files changed, 713 insertions(+), 287 deletions(-) create mode 100644 nutcracker/data/data_config/task/mfq-30.yaml create mode 100644 nutcracker/data/data_config/task/pvq-rr.yaml rename nutcracker/evaluator/judges/{frq_judge_alpha.py => frq_judge.py} (65%) delete mode 100644 nutcracker/evaluator/judges/frq_judge_beta.py rename nutcracker/evaluator/judges/{mcq_judge_zeta.py => mcq_judge.py} (74%) delete mode 100644 nutcracker/evaluator/judges/mcq_judge_alpha.py delete mode 100644 nutcracker/evaluator/judges/mcq_judge_beta.py delete mode 100644 nutcracker/evaluator/judges/mcq_judge_gamma.py create mode 100644 nutcracker/evaluator/reporter/mfq_reporter.py create mode 100644 nutcracker/evaluator/reporter/pvq_reporter.py rename nutcracker/evaluator/{ => reporter}/reporter.py (100%) create mode 100644 nutcracker/models.py create mode 100644 requirements.txt diff --git a/nutcracker/data/data_config/task/mfq-30.yaml b/nutcracker/data/data_config/task/mfq-30.yaml new file mode 100644 index 0000000..6d88414 --- /dev/null +++ b/nutcracker/data/data_config/task/mfq-30.yaml @@ -0,0 +1,4 @@ +filename: + test: test.json + config: config.yaml +task_name: mfq-30 diff --git a/nutcracker/data/data_config/task/pvq-rr.yaml b/nutcracker/data/data_config/task/pvq-rr.yaml new file mode 100644 index 0000000..97af128 --- /dev/null +++ b/nutcracker/data/data_config/task/pvq-rr.yaml @@ -0,0 +1,4 @@ +filename: + test: test.json + config: config.yaml +task_name: pvq-rr diff --git a/nutcracker/data/instance.py b/nutcracker/data/instance.py index 464a150..9ef91d2 100644 --- a/nutcracker/data/instance.py +++ b/nutcracker/data/instance.py @@ -32,10 +32,17 @@ def create_instance( # example_data_list can be empty for zero-shot evaluation example_data_list = kwargs.get('example_data_list') return MCQInstance(config, test_data, example_data_list) + + elif config['construction']['class'].lower().strip() == "mcq-survey": + # example_data_list can be empty for zero-shot evaluation + example_data_list = kwargs.get('example_data_list') + return MCQSurveyInstance(config, test_data, example_data_list) + elif config['construction']['class'].lower().strip() == "frq": # example_data_list can be empty for zero-shot evaluation example_data_list = kwargs.get('example_data_list') return FRQInstance(config, test_data, example_data_list) + else: raise ValueError("Invalid instance construction") @@ -169,6 +176,49 @@ def _format_user_prompt(self) -> str: +class 
MCQSurveyInstance(MCQInstance): + def __init__( + self, + config: dict, + test_data: dict, + example_data_list: List[dict] = None + ) -> None: + """`MCQSurveyInstance` constructor. + + Args: + config (dict): Configuration for the instance. + test_data (dict): Test data for the instance. + example_data_list (List[dict], optional): List of example data. Defaults to None. + + Raises: + None + + Returns: + None + """ + # task-level attributes + self.config = config + self.example_data_list = example_data_list # not a list of Instance, just a list of dicts + + # below are attributes that are initialized from data + self.centerpiece = test_data["centerpiece"] + self.options = test_data["options"] + self.correct_options = None + self.question_number = test_data["question_number"] + + # Check if 'context' key exists in test_data + self.context_exists = 'context' in test_data and test_data['context'] + if self.context_exists: + self.context = test_data["context"] + + # below are derivational attributes that will be updated during code run + self.user_prompt = self._format_user_prompt() + self.model_response = None + self.model_response_logprobs = None + self.response_correct = False + self.response_evaluator = None + + class FRQInstance(Instance): def __init__( self, diff --git a/nutcracker/data/instance_collection.py b/nutcracker/data/instance_collection.py index b59f126..a4ee9dd 100644 --- a/nutcracker/data/instance_collection.py +++ b/nutcracker/data/instance_collection.py @@ -7,7 +7,6 @@ # # import numpy as np -from scipy.spatial.distance import pdist, squareform # # # diff --git a/nutcracker/evaluator/__init__.py b/nutcracker/evaluator/__init__.py index eb24dc0..0615f2c 100644 --- a/nutcracker/evaluator/__init__.py +++ b/nutcracker/evaluator/__init__.py @@ -1,4 +1,6 @@ from nutcracker.evaluator.mcq_evaluator import MCQEvaluator from nutcracker.evaluator.frq_evaluator import FRQEvaluator from nutcracker.evaluator.auto_evaluator import AutoEvaluator -from nutcracker.evaluator.reporter import generate_report \ No newline at end of file +from nutcracker.evaluator.reporter.reporter import generate_report +from nutcracker.evaluator.reporter.mfq_reporter import mfq_30_generate_report +from nutcracker.evaluator.reporter.pvq_reporter import pvq_rr_generate_report \ No newline at end of file diff --git a/nutcracker/evaluator/auto_evaluator.py b/nutcracker/evaluator/auto_evaluator.py index 0d1e80f..ecca4d6 100644 --- a/nutcracker/evaluator/auto_evaluator.py +++ b/nutcracker/evaluator/auto_evaluator.py @@ -1,34 +1,44 @@ from typing import List, Union -import logging +import logging, os # from nutcracker.data.task import Task from nutcracker.data.pile import Pile -from nutcracker.data.instance import MCQInstance +from nutcracker.data.instance import MCQInstance, MCQSurveyInstance from nutcracker.data.instance import FRQInstance from nutcracker.evaluator.mcq_evaluator import MCQEvaluator from nutcracker.evaluator.frq_evaluator import FRQEvaluator +from nutcracker.models import * # # class AutoEvaluator: - def __init__(self, data: Union[Pile, Task, List[MCQInstance], List[FRQInstance]], mcq_judge = 'recommended', frq_judge = 'recommended', **judge_kwargs) -> None: + def __init__( + self, + data: Union[Pile, Task, List[MCQInstance], List[FRQInstance]], + model = OpenAI_ChatGPT(api_key=os.getenv("OPENAI_API_KEY", "default_api_key")), + mcq_judge = 'recommended', + frq_judge = 'recommended', + **judge_kwargs + ) -> None: self.data = data self.judge_kwargs = judge_kwargs self._control_logging() self.frq_judge =
frq_judge self.mcq_judge = mcq_judge + self.model = model def run(self, round_digits: int = 5) -> float: mcq_data = [instance for instance in self.data if isinstance(instance, MCQInstance)] + mcq_survey_data = [instance for instance in self.data if isinstance(instance, (MCQInstance, MCQSurveyInstance))] frq_data = [instance for instance in self.data if isinstance(instance, FRQInstance)] self.logger.info(f"found {len(mcq_data)} MCQInstances.") self.logger.info(f"found {len(frq_data)} FRQInstances.") if mcq_data: - mcq_evaluator = MCQEvaluator(mcq_data, judge=self.mcq_judge, **self.judge_kwargs) + mcq_evaluator = MCQEvaluator(mcq_data, self.model,judge=self.mcq_judge, **self.judge_kwargs) mcq_evaluator.run(round_digits) if frq_data: - frq_evaluator = FRQEvaluator(frq_data, judge=self.frq_judge, **self.judge_kwargs) + frq_evaluator = FRQEvaluator(frq_data, self.model,judge=self.frq_judge, **self.judge_kwargs) frq_evaluator.run(round_digits) # This function currently does not return accuracy. Modify as needed or use separate reporting. diff --git a/nutcracker/evaluator/frq_evaluator.py b/nutcracker/evaluator/frq_evaluator.py index 4650f43..8f4f9d2 100644 --- a/nutcracker/evaluator/frq_evaluator.py +++ b/nutcracker/evaluator/frq_evaluator.py @@ -5,20 +5,17 @@ from nutcracker.data.pile import Pile from nutcracker.data.instance import FRQInstance from nutcracker.utils import TqdmLoggingHandler -from nutcracker.evaluator.judges.frq_judge_alpha import FRQJudgeAlpha -from nutcracker.evaluator.judges.frq_judge_beta import FRQJudgeBeta +from nutcracker.evaluator.judges.frq_judge import FRQJudge +from nutcracker.models import * # # class FRQEvaluator: def __init__( - self, data: Union[Pile, Task, List[FRQInstance]], judge: str = 'alpha', **judge_kwargs) -> None: + self, data: Union[Pile, Task, List[FRQInstance]], model, judge: str = 'alpha', **judge_kwargs) -> None: self.data = data - if judge == 'alpha' or judge == 'recommended': - self.response_evaluator_judge = 'frq-judge-alpha' - self.judge = FRQJudgeAlpha(**judge_kwargs) - elif judge == 'beta' or judge == 'recommended': - self.response_evaluator_judge = 'frq-judge-beta' - self.judge = FRQJudgeBeta(**judge_kwargs) + self.model = model + self.response_evaluator_judge = f'frq-judge-{self.model}' + self.judge = FRQJudge(self.model) self._control_logging() diff --git a/nutcracker/evaluator/judges/frq_judge_alpha.py b/nutcracker/evaluator/judges/frq_judge.py similarity index 65% rename from nutcracker/evaluator/judges/frq_judge_alpha.py rename to nutcracker/evaluator/judges/frq_judge.py index 11e7901..8c77dba 100644 --- a/nutcracker/evaluator/judges/frq_judge_alpha.py +++ b/nutcracker/evaluator/judges/frq_judge.py @@ -8,11 +8,9 @@ # # # -class FRQJudgeAlpha: - def __init__(self): - # Use the kwargs as needed for configuration - # Example: self.some_setting = kwargs.get('some_setting', default_value) - pass +class FRQJudge: + def __init__(self, model): + self.model = model @@ -29,8 +27,7 @@ def is_correct(self, instance: FRQInstance) -> bool: - @staticmethod - def _parse_model_response_intent_matching(response: str, correct_options: list) -> str: + def _parse_model_response_intent_matching(self, response: str, correct_options: list) -> str: client = OpenAI() few_shot = f""" Your job is: given a response, determine if the answer is correct or not. Say True or False and nothing else. 
@@ -70,20 +67,12 @@ def _parse_model_response_intent_matching(response: str, correct_options: list) interpreted_response = None while interpreted_response is None: try: - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "user", "content": 'You respond with True or False and nothing else.'}, - {"role": "user", "content": few_shot} - ], - seed=123456789, - timeout=15, - temperature=1 + completion = self.model.respond( + 'You respond with True or False and nothing else.\n\n' + few_shot ) - interpreted_response = completion.choices[0].message.content.strip() + interpreted_response = completion.strip().upper() break - except openai.BadRequestError: - interpreted_response = "None" except KeyboardInterrupt: sys.exit() - return interpreted_response \ No newline at end of file + + return set(interpreted_response.split(', ')) if interpreted_response else set() \ No newline at end of file diff --git a/nutcracker/evaluator/judges/frq_judge_beta.py b/nutcracker/evaluator/judges/frq_judge_beta.py deleted file mode 100644 index 4264c47..0000000 --- a/nutcracker/evaluator/judges/frq_judge_beta.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Optional, List, Set -# -from nutcracker.data.instance import FRQInstance -# -# -class FRQJudgeBeta: - def __init__(self): - pass - - - - def is_correct(self, instance: FRQInstance) -> bool: - found_options = self._parse_model_response_rule_based(instance.model_response) - correct_options_set = set(instance.correct_options) - return found_options == correct_options_set - - - - @staticmethod - def _parse_model_response_rule_based(response: str) -> Optional[Set[str]]: - """ - Parse the model response based on rule-based criteria. - - - Strips and uppercases the response. - """ - # Strip, uppercase - cleaned_response = response.strip().upper() - - return cleaned_response \ No newline at end of file diff --git a/nutcracker/evaluator/judges/mcq_judge_zeta.py b/nutcracker/evaluator/judges/mcq_judge.py similarity index 74% rename from nutcracker/evaluator/judges/mcq_judge_zeta.py rename to nutcracker/evaluator/judges/mcq_judge.py index ffe1bae..02b2111 100644 --- a/nutcracker/evaluator/judges/mcq_judge_zeta.py +++ b/nutcracker/evaluator/judges/mcq_judge.py @@ -1,23 +1,22 @@ from typing import Set, Optional # from nutcracker.data.instance import MCQInstance +from nutcracker.models import * # # -from openai import OpenAI -# -# -# -class MCQJudgeZeta: - def __init__(self): - # Use the kwargs as needed for configuration - # Example: self.some_setting = kwargs.get('some_setting', default_value) - pass - +class MCQJudge: + def __init__(self, model): + self.model = model def is_correct(self, instance: MCQInstance) -> bool: # First, try rule-based parsing found_options = self._parse_model_response_rule_based(instance.model_response) + + if instance.correct_options: + correct_options_set = set(instance.correct_options) + else: + correct_options_set = None # If rule-based parsing fails or is ambiguous and intent matching is not disabled, use intent-matching if not found_options: @@ -26,9 +25,8 @@ def is_correct(self, instance: MCQInstance) -> bool: return False # Consider not rule-matched responses as wrong if intent matching is disabled instance.response_parsed = found_options - correct_options_set = set(instance.correct_options) - return found_options == correct_options_set + return found_options == correct_options_set, found_options @@ -64,47 +62,59 @@ def _parse_model_response_rule_based(response: str) -> Optional[Set[str]]: - 
@staticmethod - def _parse_model_response_intent_matching(response: str) -> Set[str]: - client = OpenAI() + def _parse_model_response_intent_matching(self, response: str) -> Set[str]: few_shot = f""" Your job is: given a response, determine which option the response is pointing to. That is, classify a given response into discrete labels: A, B, C, D, E, ..., Z, or None (if the response is pointing to multiple labels give multiple). - Examples: - + Example 1 - Clear Single Response Response: 'The answer is A.' Interpretation: A + Example 2 - Clear Multi Response Response: 'I believe B and C are correct.' Interpretation: B, C + Example 3 - Clear Single Response Response: 'Definitely D.' Interpretation: D + Example 4 - Clear Single Response Response: 'Although many think it's A, the correct answer is actually D.' Interpretation: D + Example 5 - Clear Multi Response Response: 'A seems right, but after further analysis, B and D are more accurate.' Interpretation: B, D + Example 6 - Not a Response Response: 'Question: Which of the following will cause a factory' Interpretation: None + Example 7 - Clear Single Response Response: 'Options A and B are incorrect, so it must be C.' Interpretation: C + Example 8 - Not a Response Response: 'Please choose the answer you think is correct and press the submit answer button.' Interpretation: None + Example 9 - Clear Single Response Response: 'Either A or B could work, but I lean towards B.' Interpretation: B - + + Example 10 - Clear Single Response Response: 'A. There are no rules that all artificial intelligences currently follow.' Interpretation: A + Example 11 - Response Includes Another Question Response: 'Question: The process of creating new concepts, ideas, and innovations is called A. invention. B. creativity. C. technology. D. entrepreneurship.? Answer: B' Interpretation: None + Example 12 - Clear Single Response + Response: 'Answer: E. Like you.'
+ Interpretation: E + + Now consider, Response: '{response}' Interpretation: """ @@ -112,17 +122,10 @@ def _parse_model_response_intent_matching(response: str) -> Set[str]: interpreted_response = None while interpreted_response is None: try: - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "user", "content": 'You respond with the letter option (like A, B, D, or None) separated by commas and nothing else.'}, - {"role": "user", "content": few_shot} - ], - seed=123456789, - timeout=15, - temperature=1 + completion = self.model.respond( + 'You respond with the letter option (like A, B, D, F, or None) separated by commas and nothing else.\n\n' + few_shot ) - interpreted_response = completion.choices[0].message.content.strip().upper() + interpreted_response = completion.strip().upper() break except KeyboardInterrupt: sys.exit() diff --git a/nutcracker/evaluator/judges/mcq_judge_alpha.py b/nutcracker/evaluator/judges/mcq_judge_alpha.py deleted file mode 100644 index 2ef6add..0000000 --- a/nutcracker/evaluator/judges/mcq_judge_alpha.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Set, Optional -# -from nutcracker.data.instance import MCQInstance -# -# -from openai import OpenAI -import openai -# -# -# -class MCQJudgeAlpha: - def __init__(self): - # Use the kwargs as needed for configuration - # Example: self.some_setting = kwargs.get('some_setting', default_value) - pass - - - - def is_correct(self, instance: MCQInstance) -> bool: - found_options = self._parse_model_response_intent_matching(instance.model_response) - correct_options_set = set(instance.correct_options) - - return found_options == correct_options_set - - - - @staticmethod - def _parse_model_response_intent_matching(response: str) -> Set[str]: - client = OpenAI() - few_shot = f""" - Your job is: given a response, determine to which option the response is potining to. That is, classify a given response to discrete labels: A, B, C, D, E, ..., Z, or None (if the response is pointing to multiple labels give multiple). - - Examples: - - Response: 'The answer is A.' - Interpretation: A - - Response: 'I believe B and C are correct.' - Interpretation: B, C - - Response: 'Definitely D.' - Interpretation: D - - Response: 'Although many think it's A, the correct answer is actually D.' - Interpretation: D - - Response: 'A seems right, but after further analysis, B and D are more accurate.' - Interpretation: B, D - - Response: 'Question: Which of the following will cause a factory' - Interpretation: None - - Response: 'Options A and B are incorrect, so it must be C.' - Interpretation: C - - Response: 'Please choose the answer you think is correct and press the submit answer button.' - Interpretation: None - - Response: 'Either A or B could work, but I lean towards B.' - Interpretation: B - - Response: 'A. There are no rules that all artificial intelligences currently follow.' - Interpretation: A - - Response: 'Question: The process of creating new concepts, ideas, and innovations is called A. invention. B. creativity. C. technology. D. entrepreneurship.? 
Answer: B' - Interpretation: None - - Response: '{response}' - Interpretation: - """ - - interpreted_response = None - while interpreted_response is None: - try: - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "user", "content": 'You respond with the letter option (like A, B, D, or None) separated by commas and nothing else.'}, - {"role": "user", "content": few_shot} - ], - seed=123456789, - timeout=15, - temperature=1 - ) - interpreted_response = completion.choices[0].message.content.strip().upper() - break - except openai.BadRequestError: - interpreted_response = "None" - except KeyboardInterrupt: - sys.exit() - - return set(interpreted_response.split(', ')) if interpreted_response else set() \ No newline at end of file diff --git a/nutcracker/evaluator/judges/mcq_judge_beta.py b/nutcracker/evaluator/judges/mcq_judge_beta.py deleted file mode 100644 index f0b75aa..0000000 --- a/nutcracker/evaluator/judges/mcq_judge_beta.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import Optional, List, Set -# -from nutcracker.data.instance import MCQInstance -# -# -class MCQJudgeBeta: - def __init__(self): - pass - - - - def is_correct(self, instance: MCQInstance) -> bool: - found_options = self._parse_model_response_rule_based(instance.model_response) - correct_options_set = set(instance.correct_options) - return found_options == correct_options_set - - - - @staticmethod - def _parse_model_response_rule_based(response: str) -> Optional[Set[str]]: - """ - Parse the model response based on rule-based criteria. - - - Strips and uppercases the response. - - Checks if the response matches a single alphabet. - - Returns None if more than one option is found. - - Args: - response (str): The response from the model. - - Returns: - Optional[Set[str]]: A set containing the single valid option found in the response, or None if the criteria are not met. 
- """ - # Strip, uppercase, and filter only alphabetical characters - cleaned_response = set(filter(str.isalpha, response.strip().upper())) - - # Define valid options - valid_options = {chr(i) for i in range(ord('A'), ord('Z') + 1)} - - # Intersect the cleaned response with valid options to filter out any invalid characters - found_options = cleaned_response.intersection(valid_options) - - # Return None if more than a single option is found - if len(found_options) != 1: - return None - - return found_options \ No newline at end of file diff --git a/nutcracker/evaluator/judges/mcq_judge_gamma.py b/nutcracker/evaluator/judges/mcq_judge_gamma.py deleted file mode 100644 index ffb7070..0000000 --- a/nutcracker/evaluator/judges/mcq_judge_gamma.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Optional, List -# -from nutcracker.data.instance import MCQInstance -# -# -class MCQJudgeGamma: - def __init__(self): - pass - - def is_correct(self, instance: MCQInstance) -> bool: - # Extract the log probabilities for the first position token - logprobs = instance.model_response_logprobs[0] - - # Extract the valid options (A, B, C, D, E) based on the length of the options list - valid_options = [chr(ord('A') + i) for i in range(len(instance.options))] - - # Filter logprobs for valid options and ignore missing logprob values - try: - filtered_logprobs = [entry for entry in logprobs if entry[0] in valid_options and len(entry) > 1] - except TypeError: - # if only first position token was given - logprobs = instance.model_response_logprobs - filtered_logprobs = [entry for entry in logprobs if entry[0] in valid_options and len(entry) > 1] - - # If no valid logprobs are found, consider the response incorrect - if not filtered_logprobs: - return False - - # Find the option with the highest log probability - best_option = max(filtered_logprobs, key=lambda x: x[1])[0].strip().upper() - - # Check if the best option matches any of the correct options - return best_option in instance.correct_options \ No newline at end of file diff --git a/nutcracker/evaluator/mcq_evaluator.py b/nutcracker/evaluator/mcq_evaluator.py index cd00517..b571f02 100644 --- a/nutcracker/evaluator/mcq_evaluator.py +++ b/nutcracker/evaluator/mcq_evaluator.py @@ -5,27 +5,16 @@ from nutcracker.data.pile import Pile from nutcracker.data.instance import MCQInstance from nutcracker.utils import TqdmLoggingHandler -from nutcracker.evaluator.judges.mcq_judge_alpha import MCQJudgeAlpha -from nutcracker.evaluator.judges.mcq_judge_beta import MCQJudgeBeta -from nutcracker.evaluator.judges.mcq_judge_gamma import MCQJudgeGamma -from nutcracker.evaluator.judges.mcq_judge_zeta import MCQJudgeZeta +from nutcracker.evaluator.judges.mcq_judge import MCQJudge +from nutcracker.models import * # # class MCQEvaluator: - def __init__(self, data: Union[Pile, Task, List[MCQInstance]], judge: str = 'alpha', **judge_kwargs) -> None: + def __init__(self, data: Union[Pile, Task, List[MCQInstance]], model, judge: str = 'alpha', **judge_kwargs) -> None: self.data = data - if judge == 'alpha': - self.response_evaluator_judge = 'mcq-judge-alpha' - self.judge = MCQJudgeAlpha(**judge_kwargs) - elif judge == 'beta': - self.response_evaluator_judge = 'mcq-judge-beta' - self.judge = MCQJudgeBeta(**judge_kwargs) - elif judge == 'gamma': - self.response_evaluator_judge = 'mcq-judge-gamma' - self.judge = MCQJudgeGamma(**judge_kwargs) - elif judge == 'zeta' or judge == 'recommended': - self.response_evaluator_judge = 'mcq-judge-zeta' - self.judge = MCQJudgeZeta(**judge_kwargs) + 
self.model = model + self.response_evaluator_judge = f'mcq-judge-{self.model}' + self.judge = MCQJudge(self.model) self._control_logging() @@ -33,8 +22,9 @@ def __init__(self, data: Union[Pile, Task, List[MCQInstance]], judge: str = 'alp def run(self, round_digits: int = 5) -> float: correct_count = 0 for instance in TqdmLoggingHandler(self.data, logger=self.logger, desc="Processing Instances"): - is_correct = self.judge.is_correct(instance) + is_correct, found_options = self.judge.is_correct(instance) instance.response_correct = is_correct # Update the instance attribute here + instance.judge_interpretation = found_options # Update the instance attribute here instance.response_evaluator_judge = self.response_evaluator_judge # fingerprint if is_correct: correct_count += 1 diff --git a/nutcracker/evaluator/reporter/mfq_reporter.py b/nutcracker/evaluator/reporter/mfq_reporter.py new file mode 100644 index 0000000..2b28310 --- /dev/null +++ b/nutcracker/evaluator/reporter/mfq_reporter.py @@ -0,0 +1,93 @@ +from typing import List, Dict +from collections import defaultdict, Counter +from nutcracker.data.instance import MCQSurveyInstance + +class Color: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +def mfq_30_generate_report(data: List[MCQSurveyInstance], save_path: str = None, rounding_precision: int = 4) -> None: + foundations = { + 'Harm': [1, 7, 12, 17, 23, 28], + 'Fairness': [2, 8, 13, 18, 24, 29], + 'Ingroup': [3, 9, 14, 19, 25, 30], + 'Authority': [4, 10, 15, 20, 26, 31], + 'Purity': [5, 11, 16, 21, 27, 32] + } + + options = ["Not at all relevant", "Not very relevant", "Slightly relevant", "Somewhat relevant", "Very relevant", "Extremely relevant"] + letter_to_number = {chr(ord('A') + i): i for i in range(len(options))} + print(letter_to_number) + + scores = defaultdict(list) + interpretations = defaultdict(list) + + for instance in data: + question_number = instance.question_number + judge_interpretation = instance.judge_interpretation.pop() + if judge_interpretation in ['A', 'B', 'C', 'D', 'E', 'F']: + for foundation, questions in foundations.items(): + if question_number in questions: + scores[foundation].append(letter_to_number[judge_interpretation]) + interpretations[foundation].append(judge_interpretation) + + # Step B: Compute each individual's mean score across all 30 value items (MRAT) + all_scores = [score for foundation_scores in scores.values() for score in foundation_scores] + mrat = sum(all_scores) / len(all_scores) + + # Step C: Subtract MRAT from each of the 5 foundation scores + centered_scores = {foundation: [score - mrat for score in foundation_scores] for foundation, foundation_scores in scores.items()} + + report_lines = [] + report_lines.append(f"{Color.HEADER}MFQ-30 Scores Report{Color.ENDC}\n") + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + + for foundation, score_list in centered_scores.items(): + score = sum(score_list) / len(score_list) + report_lines.append(f"{Color.BOLD}Foundation: {Color.UNDERLINE}{foundation}{Color.ENDC}") + report_lines.append(f"{Color.BOLD}Score: {Color.OKGREEN}{score:.{rounding_precision}f}{Color.ENDC}") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{Color.HEADER}Response Interpretation Breakdown{Color.ENDC}\n") + + for foundation, response_list in interpretations.items(): + 
report_lines.append(f"{Color.BOLD}Foundation: {Color.UNDERLINE}{foundation}{Color.ENDC}") + response_counts = Counter(response_list) + total_responses = len(response_list) + + for letter, number in letter_to_number.items(): + count = response_counts[letter] + percentage = (count / total_responses) * 100 + report_lines.append(f"{letter}: {options[number]} - {count} ({percentage:.2f}%)") + + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{Color.HEADER}Explanation of Scores{Color.ENDC}\n") + report_lines.append("The scores in this report have been centered using the Mean Rating (MRAT) correction method.") + report_lines.append("This correction adjusts for individual differences in scale use by subtracting each individual's mean score across all 30 value items from their scores for each foundation.") + report_lines.append("The centered scores represent the relative importance of each foundation within an individual's moral value system.") + report_lines.append("Positive scores indicate foundations that are more important to the individual compared to their average foundation rating.") + report_lines.append("Negative scores indicate foundations that are less important to the individual compared to their average foundation rating.") + report_lines.append("Scores close to zero suggest foundations that are of average importance to the individual.") + report_lines.append("The MRAT correction allows for more meaningful comparisons of moral foundation priorities across individuals and groups.") + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + + report = "\n".join(report_lines) + + if save_path: + with open(save_path, 'w') as file: + # Strip color codes for the file + file.write(report.replace(Color.HEADER, '').replace(Color.OKBLUE, '').replace(Color.OKCYAN, '').replace(Color.OKGREEN, '').replace(Color.WARNING, '').replace(Color.FAIL, '').replace(Color.ENDC, '').replace(Color.BOLD, '').replace(Color.UNDERLINE, '')) + print(f"\n{Color.OKCYAN}Report saved to {save_path}{Color.ENDC}") + + # Return the colored report for in-console display if needed + return report \ No newline at end of file diff --git a/nutcracker/evaluator/reporter/pvq_reporter.py b/nutcracker/evaluator/reporter/pvq_reporter.py new file mode 100644 index 0000000..1828103 --- /dev/null +++ b/nutcracker/evaluator/reporter/pvq_reporter.py @@ -0,0 +1,168 @@ +from typing import List, Dict +from collections import defaultdict, Counter +from nutcracker.data.instance import MCQSurveyInstance + +class Color: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +def normalize_values(values): + min_val = min(values) + max_val = max(values) + return [(val - min_val) / (max_val - min_val) if (max_val - min_val) != 0 else 0 for val in values] + +def pvq_rr_generate_report(data: List[MCQSurveyInstance], save_path: str = None, rounding_precision: int = 4) -> None: + values_mapping_19 = { + "Self-Direction-Thought": [1,23,39], + "Self-Direction-Action": [16,30,56], + "Stimulation": [10,28,43], + "Hedonism": [3,36,46], + "Achievement": [17,32,48], + "Power-Dominance": [6,29,41], + "Power-Resources": [12,20,44], + "Face" : [9,24,49], + "Security-Personal": [13,26,53], + "Security-Societal": [2,35,50], + "Tradition": [18,33,40], + "Conformity-Rules": [15,31,42], + "Conformity-Interpersonal": [4,22,51], + 
"Humility": [7,38,54], + "Benevolence-Care": [11,25,47], + "Benevolence-Dependability": [19,27,55], + "Universalism-Nature": [8,21,45], + "Universalism-Concern": [5,37,52], + "Universalism-Tolerance": [14,34,57] + } + + values_mapping_10 = { + "Self-Direction": [1,23,39,16,30,56], + "Stimulation": [10,28,43], + "Hedonism": [3,36,46], + "Achievement": [17,32,48], + "Power": [6,29,41,12,20,44], + "Security": [13,26,53,2,35,50], + "Conformity": [15,31,42,4,22,51], + "Tradition": [18,33,40,7,38,54], + "Benevolence": [11,25,47,19,27,55], + "Universalism": [8,21,45,5,37,52,14,34,57] + } + + values_mapping_higher_order = { + "Self-Transcendence": values_mapping_19["Universalism-Nature"] + values_mapping_19["Universalism-Concern"] + values_mapping_19["Universalism-Tolerance"] + values_mapping_19["Benevolence-Care"] + values_mapping_19["Benevolence-Dependability"], + + "Openness to change": values_mapping_19["Self-Direction-Thought"] + values_mapping_19["Self-Direction-Action"] + values_mapping_19["Stimulation"] + values_mapping_19["Hedonism"], + + "Self-Enhancement": values_mapping_19["Achievement"] + values_mapping_19["Power-Dominance"] + values_mapping_19["Power-Resources"], + + "Conservation": values_mapping_19["Security-Personal"] + values_mapping_19["Security-Societal"] + values_mapping_19["Tradition"] + values_mapping_19["Conformity-Rules"] + values_mapping_19["Conformity-Interpersonal"] + } + + options = ["Not like you at all", "Not like you", "A little like you", "Moderately like you", "Like you", "Very much like you"] + letter_to_number = {chr(ord('A') + i): i for i in range(len(options))} + print(letter_to_number) + + scores_19 = defaultdict(list) + scores_10 = defaultdict(list) + higher_order = defaultdict(list) + + # Step A: Compute scores for the 19 values + for instance in data: + question_number = instance.question_number + judge_interpretation = instance.judge_interpretation.pop() + if judge_interpretation in ['A', 'B', 'C', 'D', 'E', 'F']: + for value, question_number_list in values_mapping_19.items(): + if question_number in question_number_list: + scores_19[value].append(letter_to_number[judge_interpretation]) + for value, question_number_list in values_mapping_10.items(): + if question_number in question_number_list: + scores_10[value].append(letter_to_number[judge_interpretation]) + for value, question_number_list in values_mapping_higher_order.items(): + if question_number in question_number_list: + higher_order[value].append(letter_to_number[judge_interpretation]) + + # Step B: Compute each individual's mean score across all 57 value items (MRAT) + all_scores = [score for value_scores in scores_19.values() for score in value_scores] + mrat = sum(all_scores) / len(all_scores) + + # Step C: Subtract MRAT from each of the 19 value scores + centered_scores_19 = {value: [score - mrat for score in scores] for value, scores in scores_19.items()} + centered_scores_10 = {value: [score - mrat for score in scores] for value, scores in scores_10.items()} + centered_higher_order = {value: [score - mrat for score in scores] for value, scores in higher_order.items()} + + report_lines = [] + report_lines.append(f"{Color.HEADER}PVQ-RR Scores Report{Color.ENDC}\n") + + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{Color.HEADER}Narrower Basic Values{Color.ENDC}\n") + for value, response_parsed_list in centered_scores_19.items(): + if response_parsed_list: + score = sum(response_parsed_list) / len(response_parsed_list) + 
report_lines.append(f"{Color.BOLD}Value: {Color.UNDERLINE}{value}{Color.ENDC}") + report_lines.append(f"{Color.BOLD}Score: {Color.OKGREEN}{score:.{rounding_precision}f}{Color.ENDC}") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{Color.HEADER}Original Broad Basic Values{Color.ENDC}\n") + for value, response_parsed_list in centered_scores_10.items(): + if response_parsed_list: + score = sum(response_parsed_list) / len(response_parsed_list) + report_lines.append(f"{Color.BOLD}Value: {Color.UNDERLINE}{value}{Color.ENDC}") + report_lines.append(f"{Color.BOLD}Score: {Color.OKGREEN}{score:.{rounding_precision}f}{Color.ENDC}") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{Color.HEADER}Higher Order Values{Color.ENDC}\n") + for value, response_parsed_list in centered_higher_order.items(): + if response_parsed_list: + score = sum(response_parsed_list) / len(response_parsed_list) + report_lines.append(f"{Color.BOLD}Value: {Color.UNDERLINE}{value}{Color.ENDC}") + report_lines.append(f"{Color.BOLD}Score: {Color.OKGREEN}{score:.{rounding_precision}f}{Color.ENDC}") + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{Color.HEADER}Response Interpretation Breakdown{Color.ENDC}\n") + + for value, response_parsed_list in scores_19.items(): + report_lines.append(f"{Color.BOLD}Value: {Color.UNDERLINE}{value}{Color.ENDC}") + response_counts = Counter(response_parsed_list) + total_responses = len(response_parsed_list) + + for letter, number in letter_to_number.items(): + count = response_counts[number] + percentage = (count / total_responses) * 100 + report_lines.append(f"{letter}: {options[number]} - {count} ({percentage:.2f}%)") + + report_lines.append(f"{'-' * 60}{Color.ENDC}\n") + + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + report_lines.append(f"{Color.HEADER}Explanation of Scores{Color.ENDC}\n") + report_lines.append("The scores in this report have been centered using the Mean Rating (MRAT) correction method.") + report_lines.append("This correction adjusts for individual differences in scale use by subtracting each individual's mean score across all 57 value items from their scores for each value.") + report_lines.append("The centered scores represent the relative importance of each value within an individual's value system.") + report_lines.append("Positive scores indicate values that are more important to the individual compared to their average value rating.") + report_lines.append("Negative scores indicate values that are less important to the individual compared to their average value rating.") + report_lines.append("Scores close to zero suggest values that are of average importance to the individual.") + report_lines.append("The MRAT correction allows for more meaningful comparisons of value priorities across individuals and groups.") + report_lines.append(f"{Color.OKBLUE}{'-' * 60}{Color.ENDC}\n") + + report = "\n".join(report_lines) + + if save_path: + with open(save_path, 'w') as file: + # Strip color codes for the file + file.write(report.replace(Color.HEADER, '').replace(Color.OKBLUE, 
'').replace(Color.OKCYAN, '').replace(Color.OKGREEN, '').replace(Color.WARNING, '').replace(Color.FAIL, '').replace(Color.ENDC, '').replace(Color.BOLD, '').replace(Color.UNDERLINE, '')) + print(f"\n{Color.OKCYAN}Report saved to {save_path}{Color.ENDC}") + + # Return the colored report for in-console display if needed + return report \ No newline at end of file diff --git a/nutcracker/evaluator/reporter.py b/nutcracker/evaluator/reporter/reporter.py similarity index 100% rename from nutcracker/evaluator/reporter.py rename to nutcracker/evaluator/reporter/reporter.py diff --git a/nutcracker/models.py b/nutcracker/models.py new file mode 100644 index 0000000..c2bf360 --- /dev/null +++ b/nutcracker/models.py @@ -0,0 +1,305 @@ +import numpy as np +from openai import OpenAI +from anthropic import AnthropicBedrock +import cohere +import boto3 +import botocore +import os, sys, json +from collections import Counter + + + +class OpenAI_ChatGPT(): + def __init__(self, api_key): + self.model = "gpt-3.5-turbo-0125" + self.client_openai = OpenAI( + api_key=api_key, + ) + + def respond(self, user_prompt, max_retries=5): + retry_count = 0 + while retry_count < max_retries: + try: + completion = self.client_openai.chat.completions.create( + model=self.model, + messages=[ + {"role": "user", "content": f"{user_prompt}"} + ] + ) + response = completion.choices[0].message.content + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. Failed to get a response.") + + + +class OpenAI_ChatGPT4(): + def __init__(self, api_key): + self.model = "gpt-4-turbo-2024-04-09" + self.client_openai = OpenAI( + api_key=api_key, + ) + + def respond(self, user_prompt, max_retries=5): + retry_count = 0 + while retry_count < max_retries: + try: + completion = self.client_openai.chat.completions.create( + model=self.model, + messages=[ + {"role": "user", "content": f"{user_prompt}"} + ] + ) + response = completion.choices[0].message.content + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. Failed to get a response.") + + + +class OpenAI_ChatGPT4o(): + def __init__(self, api_key): + self.model = "gpt-4o-2024-05-13" + self.client_openai = OpenAI( + api_key=api_key, + ) + + def respond(self, user_prompt, max_retries=5): + retry_count = 0 + while retry_count < max_retries: + try: + completion = self.client_openai.chat.completions.create( + model=self.model, + messages=[ + {"role": "user", "content": f"{user_prompt}"} + ] + ) + response = completion.choices[0].message.content + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. 
Failed to get a response.") + + + +class Bedrock_Claude3_Opus(): + def __init__(self, aws_access_key_id, aws_secret_access_key, region_name): + self.model = "anthropic.claude-3-opus-20240229-v1:0" + self.client_anthropic = AnthropicBedrock( + aws_access_key=aws_access_key_id, + aws_secret_key=aws_secret_access_key, + aws_region=region_name, + ) + + def respond(self, user_prompt, max_retries=5): + retry_count = 0 + while retry_count < max_retries: + try: + completion = self.client_anthropic.messages.create( + model=self.model, + messages=[ + {"role": "user", "content": f"{user_prompt}"} + ], + max_tokens=1024, + ) + response = completion.content[0].text + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. Failed to get a response.") + + + +class Bedrock_Claude3_Sonnet(): + def __init__(self, aws_access_key_id, aws_secret_access_key, region_name): + self.model = "anthropic.claude-3-sonnet-20240229-v1:0" + self.client_anthropic = AnthropicBedrock( + aws_access_key=aws_access_key_id, + aws_secret_key=aws_secret_access_key, + aws_region=region_name, + ) + + def respond(self, user_prompt, max_retries=5): + retry_count = 0 + while retry_count < max_retries: + try: + completion = self.client_anthropic.messages.create( + model=self.model, + messages=[ + {"role": "user", "content": f"{user_prompt}"} + ], + max_tokens=1024, + ) + response = completion.content[0].text + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. Failed to get a response.") + + + +class Bedrock_Claude3_Haiku(): + def __init__(self, aws_access_key_id, aws_secret_access_key, region_name): + self.model = "anthropic.claude-3-haiku-20240307-v1:0" + self.client_anthropic = AnthropicBedrock( + aws_access_key=aws_access_key_id, + aws_secret_key=aws_secret_access_key, + aws_region=region_name, + ) + + def respond(self, user_prompt, max_retries=5): + retry_count = 0 + while retry_count < max_retries: + try: + completion = self.client_anthropic.messages.create( + model=self.model, + messages=[ + {"role": "user", "content": f"{user_prompt}"} + ], + max_tokens=1024, + ) + response = completion.content[0].text + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. Failed to get a response.") + + + +class Cohere_CommandRPlus(): + def __init__(self, api_key): + self.model = "command-r-plus" + self.client_cohere = cohere.Client( + api_key=api_key + ) + + def respond(self, user_prompt, max_retries=5): + retry_count = 0 + while retry_count < max_retries: + try: + completion = self.client_cohere.chat( + model=self.model, + message=f"{user_prompt}" + ) + response = completion.text + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded.
Failed to get a response.") + + + +class Bedrock_LLaMA3_70B_Inst(): + def __init__(self, aws_access_key_id, aws_secret_access_key, region_name): + self.model = "meta.llama3-70b-instruct-v1:0" + self.client_bedrock = boto3.client( + 'bedrock-runtime', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=region_name, + ) + + def respond(self, user_prompt, max_retries=5): + prompt = self._format_prompt(user_prompt) + body = { + "prompt": prompt, + "max_gen_len": 1024, + } + + retry_count = 0 + while retry_count < max_retries: + try: + results = self.client_bedrock.invoke_model( + modelId=self.model, + body=json.dumps(body) + ) + response_body = json.loads(results["body"].read()) + response = response_body["generation"] + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. Failed to get a response.") + + def _format_prompt(self, user_prompt): + prompt = "<|begin_of_text|>" + prompt += "<|start_header_id|>user<|end_header_id|>\n\n" + prompt += f"{user_prompt}<|eot_id|>" + prompt += "<|start_header_id|>assistant<|end_header_id|>" + return prompt + + +class Bedrock_LLaMA3_8B_Inst(): + def __init__(self, aws_access_key_id, aws_secret_access_key, region_name): + self.model = "meta.llama3-8b-instruct-v1:0" + self.client_bedrock = boto3.client( + 'bedrock-runtime', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=region_name, + ) + + def respond(self, user_prompt, max_retries=5): + prompt = self._format_prompt(user_prompt) + body = { + "prompt": prompt, + "max_gen_len": 1024, + } + + retry_count = 0 + while retry_count < max_retries: + try: + results = self.client_bedrock.invoke_model( + modelId=self.model, + body=json.dumps(body) + ) + response_body = json.loads(results["body"].read()) + response = response_body["generation"] + return response + except KeyboardInterrupt: + sys.exit() + except Exception as error: + print(f"Error: {error}. Retrying...") + retry_count += 1 + + raise Exception("Max retries exceeded. Failed to get a response.") + + def _format_prompt(self, user_prompt): + prompt = "<|begin_of_text|>" + prompt += "<|start_header_id|>user<|end_header_id|>\n\n" + prompt += f"{user_prompt}<|eot_id|>" + prompt += "<|start_header_id|>assistant<|end_header_id|>" + return prompt \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..80b2d44 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +anthropic==0.26.1 +boto3==1.34.115 +botocore==1.29.76 +cohere==5.5.3 +numpy==1.24.3 +openai==1.30.4 +pytest==7.4.0 +PyYAML==6.0.1 +Requests==2.32.3 +setuptools==68.0.0 +tqdm==4.65.0 diff --git a/setup.py b/setup.py index 813f911..1b3aeea 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,16 @@ from distutils.core import setup from setuptools import find_packages -this_version='0.0.1a37' +this_version='0.0.2a01' # python setup.py sdist # python -m twine upload dist/* # pip install -e . # git tag -a "0.0.1a12" -m "pypi workflow revamp" -# git push --tags +# git push --tags + +# pip install pipreqs +# pipreqs . 
setup( name = 'nutcracker-py', version=this_version, @@ -18,8 +21,7 @@ packages=find_packages(), keywords='Evaluation', install_requires=[ - 'pyyaml>=6.0.1', - 'openai>=1.10.0' + 'anthropic==0.26.1', 'boto3==1.34.115', 'botocore==1.29.76', 'cohere==5.5.3', 'numpy==1.24.3', 'openai==1.30.4', 'pytest==7.4.0', 'PyYAML==6.0.1', 'Requests==2.32.3', 'setuptools==68.0.0', 'tqdm==4.65.0' ], long_description=open('README.md').read(), long_description_content_type='text/markdown',
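Usage sketch for the refactored evaluator API in this release (illustrative only, not part of the diff): the judge model now lives in nutcracker.models and is passed into the evaluators, and survey-style tasks are summarized by the new reporters instead of accuracy. Only OpenAI_ChatGPT, MCQEvaluator, and mfq_30_generate_report below are defined by this patch; load_mfq30_instances() and the OPENAI_API_KEY environment variable are assumed placeholders.

    import os

    from nutcracker.models import OpenAI_ChatGPT
    from nutcracker.evaluator import MCQEvaluator, mfq_30_generate_report

    # Judge model used to interpret free-form responses (assumes OPENAI_API_KEY is set).
    judge_model = OpenAI_ChatGPT(api_key=os.environ["OPENAI_API_KEY"])

    # `instances` stands in for a list of MCQSurveyInstance objects whose
    # `model_response` fields were already filled in elsewhere.
    instances = load_mfq30_instances()  # hypothetical helper, not defined in this patch

    # The evaluator now takes the judge model directly; MCQJudge records
    # response_correct and judge_interpretation on each instance.
    evaluator = MCQEvaluator(instances, judge_model, judge="recommended")
    evaluator.run(round_digits=5)

    # MRAT-centered MFQ-30 foundation scores, optionally written to disk.
    print(mfq_30_generate_report(instances, save_path="mfq30_report.txt"))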