Commit

fix bugs
kritinv committed Aug 3, 2024
1 parent 99cd4f1 commit d88d8b3
Showing 12 changed files with 213 additions and 363 deletions.
19 changes: 10 additions & 9 deletions deepeval/benchmarks/big_bench_hard/big_bench_hard.py
@@ -10,7 +10,7 @@
from deepeval.benchmarks.big_bench_hard.template import BigBenchHardTemplate
from deepeval.benchmarks.utils import should_use_batch
from deepeval.scorer import Scorer
from deepeval.benchmarks.models import bbh_models_dict, bbh_confinement_statements_dict
from deepeval.benchmarks.models import *

class BigBenchHard(DeepEvalBaseBenchmark):
def __init__(
@@ -106,27 +106,24 @@ def predict(
self, model: DeepEvalBaseLLM, task: BigBenchHardTask, golden: Golden
) -> Dict:
# Define prompt template
prompt: dict = BigBenchHardTemplate.generate_output(
prompt: str = BigBenchHardTemplate.generate_output(
input=golden.input,
task=task,
n_shots=self.n_shots,
enable_cot=self.enable_cot,
)
model = bbh_models_dict[task.value]
pydantic_model = bbh_models_dict[task.value]
try:
res: model = model.generate(
prompt=prompt, schema=model
prompt=prompt, schema=pydantic_model
)
prediction = res.answer
prediction = str(res.answer)
except TypeError:
prompt += bbh_confinement_statements_dict[task.value]
prediction = model.generate(prompt)
prediction = str(model.generate(prompt))

if isinstance(prediction, tuple):
prediction = prediction[0]

# WARNING: doesn't work. Should use regex to isolate true and false instead
prediction = prediction[:-1] if self.enable_cot else prediction

# Define Metric
score = self.scorer.exact_match_score(
@@ -179,8 +176,12 @@ def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]:
dataset = load_dataset("lukaemon/bbh", task.value)

goldens: List[Golden] = []
count = 0
for data in dataset["test"]:
golden = Golden(input=data["input"], expected_output=data["target"])
goldens.append(golden)
count += 1
if count > 10:
break

return goldens
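
The change above settles on a pattern that recurs across the benchmark files in this commit: look up a per-task pydantic schema under a name that no longer shadows the model argument, try schema-enforced generation, coerce the answer to a string, and fall back to appending a confinement statement when the wrapper rejects the schema keyword. Below is a minimal, self-contained sketch of that pattern; FakeLLM and the two toy dictionaries are stand-ins for illustration only, not deepeval's API.

    from typing import Literal, Optional, Type
    from pydantic import BaseModel


    class BooleanModel(BaseModel):
        answer: Literal["True", "False"]


    class FakeLLM:
        # Stand-in for a DeepEvalBaseLLM wrapper. Real wrappers either accept a
        # schema (returning an instance of it) or raise TypeError when one is passed.
        def generate(self, prompt: str, schema: Optional[Type[BaseModel]] = None):
            if schema is None:
                return "True"              # plain-text fallback path
            return schema(answer="True")   # schema-enforced path


    bbh_models_dict = {"boolean_expressions": BooleanModel}
    bbh_confinement_statements_dict = {
        "boolean_expressions": "\n\nAnswer with only 'True' or 'False'."
    }


    def predict(model: FakeLLM, task: str, prompt: str) -> str:
        pydantic_model = bbh_models_dict[task]  # renamed so it no longer shadows `model`
        try:
            res = model.generate(prompt=prompt, schema=pydantic_model)
            prediction = str(res.answer)        # coerce the typed answer to str for scoring
        except TypeError:
            # Wrapper does not support schemas: confine the output via the prompt instead.
            prompt += bbh_confinement_statements_dict[task]
            prediction = str(model.generate(prompt))
        return prediction


    print(predict(FakeLLM(), "boolean_expressions", "not ( True ) and ( True ) is"))
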
25 changes: 13 additions & 12 deletions deepeval/benchmarks/drop/drop.py
@@ -11,7 +11,7 @@
from deepeval.benchmarks.drop.template import DROPTemplate
from deepeval.benchmarks.utils import should_use_batch
from deepeval.scorer import Scorer
from deepeval.benchmarks.models import NumberModel, StringModel, DateModel
from deepeval.benchmarks.models import DROPDateModel, DROPNumberModel, DROPStringModel

DELIMITER = ","

@@ -106,29 +106,30 @@ def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
)

# Enforced model generation
type_info = golden["context"][0]
type_info = golden.context[0]
try:
if type_info == "number":
schema = NumberModel
schema = DROPNumberModel
elif type_info == "date":
schema = DateModel
schema = DROPDateModel
elif type_info == "span":
schema = StringModel
res: Union[NumberModel, DateModel, StringModel] = model.generate(
schema = DROPStringModel
res: Union[DROPNumberModel, DROPDateModel, DROPStringModel] = model.generate(
prompt=prompt, schema=schema
)
prediction = res.answer
prediction = str(res.answer)
except TypeError:
prompt += "Output should be of type {type}. No explanation needed.".format(type=type)
prediction = model.generate(prompt)
prediction = str(model.generate(prompt))

# For native models, shouldn't happen but just in case
if isinstance(prediction, tuple):
prediction = prediction[0]

# Define Metric
score = self.scorer.exact_match_score(
golden.expected_output, prediction
expected_output = DROPTemplate.parse_str_to_list(golden.expected_output, DELIMITER)
score = self.scorer.quasi_contains_score(
expected_output, prediction
)
return {"prediction": prediction, "score": score}

@@ -161,7 +162,7 @@ def batch_predict(
prediction = predictions[i]
golden = goldens[i]
# Define Metric
score = self.scorer.exact_match_score(
score = self.scorer.quasi_exact_match_score(
golden.expected_output, prediction
)
res.append({"prediction": prediction, "score": score})
@@ -196,7 +197,7 @@ def load_benchmark_dataset(self, task: DROPTask) -> List[Golden]:
goldens: List[Golden] = []
for data in val_set:
input = DROPTemplate.format_question(data, include_answer=False)
output = DELIMITER.join(tuple(data["answers_spans"]["spans"][0]))
output = DROPTemplate.parse_list_to_str(data["answers_spans"]["spans"], DELIMITER)
output_type = data["answers_spans"]["types"][0]
golden = Golden(
input=input, expected_output=output, context=[output_type]
13 changes: 13 additions & 0 deletions deepeval/benchmarks/drop/template.py
@@ -1,3 +1,5 @@
from typing import List

class DROPTemplate:

# Most of this template was taken from MMLU Github Repo
@@ -24,3 +26,14 @@ def format_question(data: dict, include_answer: bool = False):
if include_answer:
prompt += data["answers_spans"]["spans"][0] + "\n"
return prompt

@staticmethod
def parse_list_to_str(input_list: List, DELIMITER: str) -> str:
if len(input_list) == 1:
return input_list[0]
else:
return DELIMITER.join(tuple(input_list))

@staticmethod
def parse_str_to_list(input_str: str, DELIMITER: str) -> List[str]:
return input_str.split(DELIMITER)
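
A brief usage sketch of the two new helpers, assuming answer spans never contain the delimiter themselves (the single-element case deliberately skips the join, so a lone answer round-trips unchanged). The example spans are illustrative only.

    from deepeval.benchmarks.drop.template import DROPTemplate

    DELIMITER = ","

    # Multi-span answer: joined for storage in the golden, split again before scoring.
    spans = ["Tom Brady", "Peyton Manning"]
    joined = DROPTemplate.parse_list_to_str(spans, DELIMITER)     # "Tom Brady,Peyton Manning"
    restored = DROPTemplate.parse_str_to_list(joined, DELIMITER)  # ["Tom Brady", "Peyton Manning"]
    assert restored == spans

    # Single-span answer: stored as-is, still split back into a one-element list.
    single = DROPTemplate.parse_list_to_str(["42"], DELIMITER)    # "42"
    assert DROPTemplate.parse_str_to_list(single, DELIMITER) == ["42"]

Splitting the stored string back into a list is what lets drop.py score with quasi_contains_score against the individual expected spans rather than an exact match on the joined string.
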
10 changes: 7 additions & 3 deletions deepeval/benchmarks/gsm8k/gsm8k.py
@@ -44,7 +44,7 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
if score:
overall_correct_predictions += 1
predictions_row.append((golden.input, prediction, score))

# Calculate overall accuracy
overall_accuracy = (
overall_correct_predictions / overall_total_predictions
@@ -75,15 +75,19 @@ def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
res: NumberModel = model.generate(
prompt=prompt, schema=NumberModel
)
prediction = res.answer
prediction = str(res.answer)
except TypeError:
prompt += "Make sure to output only the numerical answer."
prediction = model.generate(prompt)
prediction = str(model.generate(prompt))

# Define Metric
print(golden.expected_output)
print(prediction)

score = self.scorer.exact_match_score(
golden.expected_output, prediction
)

return {"prediction": prediction, "score": score}

def load_benchmark_dataset(self) -> List[Golden]:
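
The str(...) wrappers added here (and in the other predict methods) matter because schema-enforced generation returns typed values, while the goldens store their targets as strings. A small sketch of the mismatch, with exact_match_score approximated as a plain string comparison (an assumption about Scorer, kept deliberately simple):

    def exact_match_score(expected: str, prediction: str) -> int:
        # Simplified stand-in for Scorer.exact_match_score.
        return 1 if expected.strip() == prediction.strip() else 0


    answer = 72             # NumberModel.answer is an int
    expected_output = "72"  # the GSM8K golden stores its target as a string

    # Comparing the raw int against the string can never match (and calling .strip()
    # on an int would fail outright); coercing with str() makes both sides comparable.
    print(exact_match_score(expected_output, str(answer)))  # 1
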
2 changes: 1 addition & 1 deletion deepeval/benchmarks/mmlu/mmlu.py
@@ -190,4 +190,4 @@ def load_benchmark_dataset(self, task: MMLUTask) -> List[Golden]:
input = MMLUTemplate.format_question(data, include_answer=False)
golden = Golden(input=input, expected_output=data["target"])
goldens.append(golden)
return goldens
return goldens
23 changes: 17 additions & 6 deletions deepeval/benchmarks/models.py
@@ -1,19 +1,30 @@
from pydantic import BaseModel
from typing import List, Literal
from typing import List, Literal, Union

class MultipleChoiceModel(BaseModel):
answer: Literal["A", "B", "C", "D"]

class ListOfNumbersModel(BaseModel):
answer: List[int]

class ListofStringsModel(BaseModel):
answer: List[str]

class NumberModel(BaseModel):
answer: int

class DateModel(BaseModel):
class StringModel(BaseModel):
answer: str

class StringModel(BaseModel):
# DROP Models #############################

class DROPStringModel(BaseModel):
answer: str

class DROPNumberModel(BaseModel):
answer: int

class DROPDateModel(BaseModel):
answer: str

# BBH Models #############################
@@ -55,19 +66,19 @@ class BBHMultipleChoice18(BaseModel):
answer: Literal["(A)", "(B)", "(C)", "(D)", "(E)", "(F)", "(G)", "(H)", "(I)", "(J)", "(K)", "(L)", "(M)", "(N)", "(O)", "(P)", "(Q)", "(R)"]

bbh_models_dict = {
"boolean_expression": BooleanModel,
"boolean_expressions": BooleanModel,
"causal_judgement": AffirmationModel,
"date_understanding": BBHMultipleChoice6,
"disambiguation_qa": BBHMultipleChoice3,
"dyck_language": StringModel,
"dyck_languages": StringModel,
"formal_fallacies": ValidModel,
"geometric_shapes": BBHMultipleChoice11,
"hyperbaton": BBHMultipleChoice2,
"logical_deduction_three_objects": BBHMultipleChoice3,
"logical_deduction_five_objects": BBHMultipleChoice5,
"logical_deduction_seven_objects": BBHMultipleChoice7,
"movie_recommendation": BBHMultipleChoice5,
"multi_step_arithmetic": NumberModel,
"multistep_arithmetic_two": NumberModel,
"navigate": AffirmationModel,
"object_counting": NumberModel,
"penguins_in_a_table": BBHMultipleChoice5,
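
The renamed keys ("boolean_expressions", "dyck_languages", "multistep_arithmetic_two") matter because predict() indexes this dict with task.value, which is also the subset name passed to load_dataset("lukaemon/bbh", ...); a stale key means a KeyError before generation even starts. A quick spot check, importing bbh_models_dict as big_bench_hard.py does:

    from deepeval.benchmarks.models import bbh_models_dict

    # Subset names as the BBH dataset spells them; a missing key here would
    # reproduce the lookup failure this commit fixes.
    for key in ["boolean_expressions", "dyck_languages", "multistep_arithmetic_two"]:
        print(key, "->", bbh_models_dict[key].__name__)
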
22 changes: 14 additions & 8 deletions deepeval/benchmarks/truthful_qa/truthful_qa.py
@@ -114,19 +114,25 @@ def predict(
# Enforced model generation
try:
if mode == TruthfulQAMode.MC1:
base_model = NumberModel
res: NumberModel = model.generate(
prompt=prompt, schema=NumberModel
)
prediction = str(res.answer)
elif mode == TruthfulQAMode.MC2:
base_model = ListOfNumbersModel
res: Union[ListOfNumbersModel | NumberModel] = model.generate(
prompt=prompt, schema=base_model
)
prediction = res.answer
res: ListOfNumbersModel = model.generate(
prompt=prompt, schema=ListOfNumbersModel
)
prediction = str(res.answer)

except TypeError:
if mode == TruthfulQAMode.MC1:
prompt += "\n\nOutput '1', '2', '3', '4', '5' etc. (number in front of answer choice). Full answer not needed."
elif mode == TruthfulQAMode.MC2:
prompt += "\n\nOutput the indices of all correct answers as a python list (e.g. '[1, 3, 4]'). Full answers are not needed."
prediction = model.generate(prompt)
prediction = str(model.generate(prompt))

print("Expected: " + (golden.expected_output))
print("Prediction: " + (prediction))

# For native models, shouldn't happen but just in case
if isinstance(prediction, tuple):
@@ -223,5 +229,5 @@ def load_benchmark_dataset(
input=input, expected_output=str(expected_output)
)
goldens.append(golden)

return goldens
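
One detail worth noting about the MC2 branch: the schema yields a Python list, the golden already stores str(expected_output) (see the load_benchmark_dataset hunk above), and the new str(res.answer) puts the prediction in the same textual form. A small sketch, with ListOfNumbersModel defined locally to match deepeval/benchmarks/models.py:

    from typing import List
    from pydantic import BaseModel


    class ListOfNumbersModel(BaseModel):
        answer: List[int]


    res = ListOfNumbersModel(answer=[1, 3, 4])  # what schema-enforced generation returns for MC2
    prediction = str(res.answer)                # "[1, 3, 4]"
    expected_output = str([1, 3, 4])            # how the golden stores its target

    # True in this toy case; actual agreement still depends on which indices the model returns.
    print(prediction == expected_output)
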
1 change: 0 additions & 1 deletion deepeval/models/gpt_model.py
@@ -16,7 +16,6 @@ def log_retry_error(retry_state):
f"OpenAI rate limit exceeded. Retrying: {retry_state.attempt_number} time(s)..."
)


valid_gpt_models = [
"gpt-4o-mini",
"gpt-4o",