From d7d01dc83038ede3a9804d7114bceee465249cc0 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Sat, 19 Oct 2024 09:59:24 -0500 Subject: [PATCH 01/24] Add files via upload Confusion matrices and MCC --- dspy/evaluate/confusion.py | 388 +++++++++++++++++++++++++++++++++++++ 1 file changed, 388 insertions(+) create mode 100644 dspy/evaluate/confusion.py diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py new file mode 100644 index 000000000..bdcfc5a6d --- /dev/null +++ b/dspy/evaluate/confusion.py @@ -0,0 +1,388 @@ +import re +import numpy as np +from dspy.dspy import LabeledFewShot, BootstrapFewShot +from dspy.dspy.evaluate.evaluate import * +import random + +from dspy.dspy.teleprompt.teleprompt import Teleprompter + + +class Confusion: + def __init__( + self, + *, + labels, + devset, + num_threads=1, + display_progress=False, + display_table=False, + max_errors=5, + return_matrix=False, + return_outputs=False, + provide_traceback=False, + **_kwargs, + ): + self.labels = labels + self.devset = devset + self.num_threads = num_threads + self.display_progress = display_progress + self.display_table = display_table + self.max_errors = max_errors + self.error_count = 0 + self.error_lock = threading.Lock() + self.cancel_jobs = threading.Event() + self.return_matrix = return_matrix + self.return_outputs = return_outputs + self.provide_traceback = provide_traceback + self.results = {label: [] for label in labels} + + def extract_answer_from_prediction(self, prediction): + response = prediction["response"] + match = re.search(r"|".join(self.labels), response.lower()) + return match.group(0) if match else None + + def construct_matrix(self): + labels = self.labels + + # Initialize the confusion matrix + confusion_matrix = np.zeros((len(labels), len(labels)), dtype=int) + + # Fill the confusion matrix + for idx, label in enumerate(labels): + for prediction in self.results[label]: + answer = self.extract_answer_from_prediction(prediction) + if answer in labels: + confusion_matrix[idx][labels.index(answer)] += 1 + + return confusion_matrix + + def get_matthews_corrcoef(self): + C = self.construct_matrix() + # rest is from sklearn + + t_sum = C.sum(axis=1, dtype=np.float64) + p_sum = C.sum(axis=0, dtype=np.float64) + n_correct = np.trace(C, dtype=np.float64) + n_samples = p_sum.sum() + cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) + cov_ypyp = n_samples ** 2 - np.dot(p_sum, p_sum) + cov_ytyt = n_samples ** 2 - np.dot(t_sum, t_sum) + + if cov_ypyp * cov_ytyt == 0: + return 0.0 + else: + return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) + + def _execute_single_thread(self, wrapped_program, devset, display_progress): + reordered_devset = [] + + pbar = tqdm.tqdm(total=len(devset), dynamic_ncols=True, disable=not display_progress, file=sys.stdout) + for idx, arg in devset: + with logging_redirect_tqdm(): + example_idx, example, prediction = wrapped_program(idx, arg) + reordered_devset.append((example_idx, example, prediction)) + self.results[arg["response"]].append(prediction) + self._update_progress(pbar) + + pbar.close() + + return reordered_devset + + def _execute_multi_thread(self, wrapped_program, devset, num_threads, display_progress): + reordered_devset = [] + job_cancelled = "cancelled" + + # context manger to handle sigint + @contextlib.contextmanager + def interrupt_handler_manager(): + """Sets the cancel_jobs event when a SIGINT is received.""" + default_handler = signal.getsignal(signal.SIGINT) + + def interrupt_handler(sig, frame): + self.cancel_jobs.set() + dspy.logger.warning("Received SIGINT. Cancelling evaluation.") + default_handler(sig, frame) + + signal.signal(signal.SIGINT, interrupt_handler) + yield + # reset to the default handler + signal.signal(signal.SIGINT, default_handler) + + def cancellable_wrapped_program(idx, arg): + # If the cancel_jobs event is set, return the cancelled_job literal + if self.cancel_jobs.is_set(): + return None, None, job_cancelled, None + return arg, wrapped_program(idx, arg) + + with ThreadPoolExecutor(max_workers=num_threads) as executor, interrupt_handler_manager(): + futures = {executor.submit(cancellable_wrapped_program, idx, arg) for idx, arg in devset} + pbar = tqdm.tqdm(total=len(devset), dynamic_ncols=True, disable=not display_progress) + + for future in as_completed(futures): + arg, (example_idx, example, prediction) = future.result() + + # use the cancelled_job literal to check if the job was cancelled - use "is" not "==" + # in case the prediction is "cancelled" for some reason. + if prediction is job_cancelled: + continue + + reordered_devset.append((example_idx, example, prediction)) + self.results[arg["response"]].append(prediction) + self._update_progress(pbar) + pbar.close() + + if self.cancel_jobs.is_set(): + dspy.logger.warning("Evaluation was cancelled. The results may be incomplete.") + raise KeyboardInterrupt + + return reordered_devset + + def _update_progress(self, pbar): + mcc = self.get_matthews_corrcoef() + pbar.set_description(f"MCC: {mcc:.6f}") + pbar.update() + + def __call__( + self, + program, + devset=None, + num_threads=None, + display_progress=None, + display_table=None, + return_matrix=None, + return_outputs=None, + ): + devset = devset if devset is not None else self.devset + num_threads = num_threads if num_threads is not None else self.num_threads + display_progress = display_progress if display_progress is not None else self.display_progress + display_table = display_table if display_table is not None else self.display_table + return_matrix = return_matrix if return_matrix is not None else self.return_matrix + return_outputs = return_outputs if return_outputs is not None else self.return_outputs + results = [] + + def wrapped_program(example_idx, example): + # NOTE: TODO: Won't work if threads create threads! + thread_stacks = dspy.settings.stack_by_thread + creating_new_thread = threading.get_ident() not in thread_stacks + if creating_new_thread: + thread_stacks[threading.get_ident()] = list(dspy.settings.main_stack) + + try: + prediction = program(**example.inputs()) + + # increment assert and suggest failures to program's attributes + if hasattr(program, "_assert_failures"): + program._assert_failures += dspy.settings.get("assert_failures") + if hasattr(program, "_suggest_failures"): + program._suggest_failures += dspy.settings.get("suggest_failures") + + return example_idx, example, prediction + except Exception as e: + with self.error_lock: + self.error_count += 1 + current_error_count = self.error_count + if current_error_count >= self.max_errors: + raise e + + if self.provide_traceback: + dspy.logger.error( + f"Error for example in dev set: \t\t {e}\n\twith inputs:\n\t\t{example.inputs()}\n\nStack trace:\n\t{traceback.format_exc()}" + ) + else: + dspy.logger.error( + f"Error for example in dev set: \t\t {e}. Set `provide_traceback=True` to see the stack trace." + ) + + return example_idx, example, {}, 0.0 + finally: + if creating_new_thread: + del thread_stacks[threading.get_ident()] + + devset = list(enumerate(devset)) + tqdm.tqdm._instances.clear() + + if num_threads == 1: + reordered_devset = self._execute_single_thread(wrapped_program, devset, display_progress) + else: + reordered_devset = self._execute_multi_thread( + wrapped_program, + devset, + num_threads, + display_progress, + ) + + dspy.logger.info(f"MCC: {self.get_matthews_corrcoef():.6f}") + + predicted_devset = sorted(reordered_devset) + + if return_outputs: # Handle the return_outputs logic + results = [(example, prediction) for _, example, prediction in predicted_devset] + + data = [merge_dicts(example, prediction) for _, example, prediction in predicted_devset] + + result_df = pd.DataFrame(data) + + # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) + result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell) + + if display_table: + if isinstance(display_table, bool): + df_to_display = result_df.copy() + truncated_rows = 0 + else: + df_to_display = result_df.head(display_table).copy() + truncated_rows = len(result_df) - display_table + + display_dataframe(df_to_display) + + if truncated_rows > 0: + # Simplified message about the truncated rows + message = f""" +
+ ... {truncated_rows} more rows not displayed ... +
+ """ + display(HTML(message)) + + mcc = self.get_matthews_corrcoef() + + if return_matrix and return_outputs: + return mcc, results, self.construct_matrix() + if return_matrix: + return mcc, self.construct_matrix() + if return_outputs: + return mcc, results + + return mcc + + +class MCCBootstrapFewShotWithRandomSearch(Teleprompter): + def __init__( + self, + labels, + teacher_settings={}, + max_bootstrapped_demos=4, + max_labeled_demos=16, + max_rounds=1, + num_candidate_programs=16, + num_threads=6, + max_errors=10, + stop_at_score=None, + metric_threshold=None, + ): + self.labels = labels + self.teacher_settings = teacher_settings + self.max_rounds = max_rounds + + self.num_threads = num_threads + self.stop_at_score = stop_at_score + self.metric_threshold = metric_threshold + self.min_num_samples = 1 + self.max_num_samples = max_bootstrapped_demos + self.max_errors = max_errors + self.num_candidate_sets = num_candidate_programs + self.max_labeled_demos = max_labeled_demos + + print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.") + print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") + + def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True): + self.trainset = trainset + self.valset = valset or trainset # TODO: FIXME: Note this choice. + + scores = [] + score_data = [] + + for seed in range(-3, self.num_candidate_sets): + if (restrict is not None) and (seed not in restrict): + continue + + trainset_copy = list(self.trainset) + + if seed == -3: + # zero-shot + program = student.reset_copy() + + elif seed == -2: + # labels only + teleprompter = LabeledFewShot(k=self.max_labeled_demos) + program = teleprompter.compile(student, trainset=trainset_copy, sample=labeled_sample) + + elif seed == -1: + # unshuffled few-shot + optimizer = BootstrapFewShot( + metric_threshold=self.metric_threshold, + max_bootstrapped_demos=self.max_num_samples, + max_labeled_demos=self.max_labeled_demos, + teacher_settings=self.teacher_settings, + max_rounds=self.max_rounds, + max_errors=self.max_errors, + ) + program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) + + else: + assert seed >= 0, seed + + random.Random(seed).shuffle(trainset_copy) + size = random.Random(seed).randint(self.min_num_samples, self.max_num_samples) + + optimizer = BootstrapFewShot( + metric_threshold=self.metric_threshold, + max_bootstrapped_demos=size, + max_labeled_demos=self.max_labeled_demos, + teacher_settings=self.teacher_settings, + max_rounds=self.max_rounds, + max_errors=self.max_errors, + ) + + program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) + + confusion = Confusion( + labels=self.labels, + devset=self.valset, + num_threads=self.num_threads, + max_errors=self.max_errors, + display_table=False, + display_progress=True, + ) + + score, cm = confusion(program, return_matrix=True) + + ############ Assertion-aware Optimization ############ + if hasattr(program, "_suggest_failures"): + score = score - program._suggest_failures * 0.2 + if hasattr(program, "_assert_failures"): + score = 0 if program._assert_failures > 0 else score + ###################################################### + + if len(scores) == 0 or score > max(scores): + print("New best score:", score, "for seed", seed) + best_program = program + + scores.append(score) + print(f"Scores so far: {scores}") + print(f"Best score so far: {max(scores)}") + + score_data.append((score, + pd.DataFrame(cm, + index=pd.Index(self.labels, name="Predicted"), + columns=pd.Index(self.labels, name="Actual")), + seed, + program)) + + if self.stop_at_score is not None and score >= self.stop_at_score: + print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}") + break + + # To best program, attach all program candidates in decreasing average score + best_program.candidate_programs = score_data + best_program.candidate_programs = sorted(best_program.candidate_programs, key=lambda x: x[0], reverse=True) + + print(f"{len(best_program.candidate_programs)} candidate programs found.") + + return best_program From 7372e789dc492231e75cb5c8aea19b629e20b9b3 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Sat, 19 Oct 2024 12:59:53 -0500 Subject: [PATCH 02/24] Update confusion.py Fixes --- dspy/evaluate/confusion.py | 52 +++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index bdcfc5a6d..63a976133 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -1,10 +1,10 @@ import re import numpy as np -from dspy.dspy import LabeledFewShot, BootstrapFewShot -from dspy.dspy.evaluate.evaluate import * +from dspy import LabeledFewShot, BootstrapFewShot +from dspy.evaluate.evaluate import * import random -from dspy.dspy.teleprompt.teleprompt import Teleprompter +from dspy.teleprompt.teleprompt import Teleprompter class Confusion: @@ -34,14 +34,13 @@ def __init__( self.return_matrix = return_matrix self.return_outputs = return_outputs self.provide_traceback = provide_traceback - self.results = {label: [] for label in labels} def extract_answer_from_prediction(self, prediction): response = prediction["response"] match = re.search(r"|".join(self.labels), response.lower()) return match.group(0) if match else None - def construct_matrix(self): + def construct_matrix(self, preds): labels = self.labels # Initialize the confusion matrix @@ -49,15 +48,15 @@ def construct_matrix(self): # Fill the confusion matrix for idx, label in enumerate(labels): - for prediction in self.results[label]: + for prediction in preds[label]: answer = self.extract_answer_from_prediction(prediction) if answer in labels: confusion_matrix[idx][labels.index(answer)] += 1 return confusion_matrix - def get_matthews_corrcoef(self): - C = self.construct_matrix() + def get_matthews_corrcoef(self, preds, return_cm=False): + C = self.construct_matrix(preds) # rest is from sklearn t_sum = C.sum(axis=1, dtype=np.float64) @@ -69,11 +68,15 @@ def get_matthews_corrcoef(self): cov_ytyt = n_samples ** 2 - np.dot(t_sum, t_sum) if cov_ypyp * cov_ytyt == 0: - return 0.0 + out = 0.0 else: - return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) + out = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) + + if return_cm: + return out, C + return out - def _execute_single_thread(self, wrapped_program, devset, display_progress): + def _execute_single_thread(self, wrapped_program, devset, display_progress, preds): reordered_devset = [] pbar = tqdm.tqdm(total=len(devset), dynamic_ncols=True, disable=not display_progress, file=sys.stdout) @@ -81,14 +84,14 @@ def _execute_single_thread(self, wrapped_program, devset, display_progress): with logging_redirect_tqdm(): example_idx, example, prediction = wrapped_program(idx, arg) reordered_devset.append((example_idx, example, prediction)) - self.results[arg["response"]].append(prediction) - self._update_progress(pbar) + preds[arg["response"]].append(prediction) + self._update_progress(pbar, preds) pbar.close() return reordered_devset - def _execute_multi_thread(self, wrapped_program, devset, num_threads, display_progress): + def _execute_multi_thread(self, wrapped_program, devset, num_threads, display_progress, preds): reordered_devset = [] job_cancelled = "cancelled" @@ -127,8 +130,8 @@ def cancellable_wrapped_program(idx, arg): continue reordered_devset.append((example_idx, example, prediction)) - self.results[arg["response"]].append(prediction) - self._update_progress(pbar) + preds[arg["response"]].append(prediction) + self._update_progress(pbar, preds) pbar.close() if self.cancel_jobs.is_set(): @@ -137,8 +140,8 @@ def cancellable_wrapped_program(idx, arg): return reordered_devset - def _update_progress(self, pbar): - mcc = self.get_matthews_corrcoef() + def _update_progress(self, pbar, preds): + mcc = self.get_matthews_corrcoef(preds) pbar.set_description(f"MCC: {mcc:.6f}") pbar.update() @@ -201,17 +204,20 @@ def wrapped_program(example_idx, example): devset = list(enumerate(devset)) tqdm.tqdm._instances.clear() + preds = {label: [] for label in self.labels} + if num_threads == 1: - reordered_devset = self._execute_single_thread(wrapped_program, devset, display_progress) + reordered_devset = self._execute_single_thread(wrapped_program, devset, display_progress, preds) else: reordered_devset = self._execute_multi_thread( wrapped_program, devset, num_threads, display_progress, + preds, ) - dspy.logger.info(f"MCC: {self.get_matthews_corrcoef():.6f}") + dspy.logger.info(f"MCC: {self.get_matthews_corrcoef(preds):.6f}") predicted_devset = sorted(reordered_devset) @@ -249,12 +255,12 @@ def wrapped_program(example_idx, example): """ display(HTML(message)) - mcc = self.get_matthews_corrcoef() + mcc, cm = self.get_matthews_corrcoef(preds, return_cm=True) if return_matrix and return_outputs: - return mcc, results, self.construct_matrix() + return mcc, results, cm if return_matrix: - return mcc, self.construct_matrix() + return mcc, cm if return_outputs: return mcc, results From 31586c917bb4ce5a1e9c047c30ae43e1fe1cf5c7 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Sat, 19 Oct 2024 14:18:12 -0500 Subject: [PATCH 03/24] class-weighted confusion/mcc --- dspy/evaluate/confusion.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 63a976133..9c2a97f74 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -1,4 +1,5 @@ import re +from collections import Counter import numpy as np from dspy import LabeledFewShot, BootstrapFewShot from dspy.evaluate.evaluate import * @@ -34,6 +35,7 @@ def __init__( self.return_matrix = return_matrix self.return_outputs = return_outputs self.provide_traceback = provide_traceback + self.freqs = {k: 1 / v for k, v in Counter([arg["response"] for arg in devset]).items()} def extract_answer_from_prediction(self, prediction): response = prediction["response"] @@ -44,14 +46,17 @@ def construct_matrix(self, preds): labels = self.labels # Initialize the confusion matrix - confusion_matrix = np.zeros((len(labels), len(labels)), dtype=int) + confusion_matrix = np.zeros((len(labels), len(labels)), dtype=float) + + # Get answers + answers = {label: [self.extract_answer_from_prediction(prediction) + for prediction in preds[label]] for label in labels} # Fill the confusion matrix for idx, label in enumerate(labels): - for prediction in preds[label]: - answer = self.extract_answer_from_prediction(prediction) - if answer in labels: - confusion_matrix[idx][labels.index(answer)] += 1 + for answer in answers[label]: + if answer in self.freqs: + confusion_matrix[idx][labels.index(answer)] += self.freqs[label] return confusion_matrix From 28485132373a88bc4681114a29b4d5c69dc3e895 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Sun, 20 Oct 2024 00:16:41 -0500 Subject: [PATCH 04/24] fixes --- dspy/evaluate/confusion.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 9c2a97f74..4be3c05a5 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -35,7 +35,8 @@ def __init__( self.return_matrix = return_matrix self.return_outputs = return_outputs self.provide_traceback = provide_traceback - self.freqs = {k: 1 / v for k, v in Counter([arg["response"] for arg in devset]).items()} + self.freqs = Counter([example["response"] for example in devset]) + self.inv_freqs = {k: 1 / v for k, v in self.freqs.items()} def extract_answer_from_prediction(self, prediction): response = prediction["response"] @@ -55,8 +56,8 @@ def construct_matrix(self, preds): # Fill the confusion matrix for idx, label in enumerate(labels): for answer in answers[label]: - if answer in self.freqs: - confusion_matrix[idx][labels.index(answer)] += self.freqs[label] + if answer in self.inv_freqs: + confusion_matrix[idx][labels.index(answer)] += self.inv_freqs[label] return confusion_matrix @@ -76,9 +77,13 @@ def get_matthews_corrcoef(self, preds, return_cm=False): out = 0.0 else: out = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) - + if return_cm: - return out, C + cm = pd.DataFrame(C, + index=pd.Index(self.labels, name="Actual"), + columns=pd.Index(self.labels, name="Predicted")) + cm["support"] = self.freqs + return out, cm return out def _execute_single_thread(self, wrapped_program, devset, display_progress, preds): @@ -380,9 +385,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None print(f"Best score so far: {max(scores)}") score_data.append((score, - pd.DataFrame(cm, - index=pd.Index(self.labels, name="Predicted"), - columns=pd.Index(self.labels, name="Actual")), + cm, seed, program)) From 5666f88d021a4ba9e4a7c929c7f2edb4aee25a2f Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Sun, 20 Oct 2024 18:54:50 -0500 Subject: [PATCH 05/24] Took out incorrect support calc --- dspy/evaluate/confusion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 4be3c05a5..1bedb57c4 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -35,8 +35,7 @@ def __init__( self.return_matrix = return_matrix self.return_outputs = return_outputs self.provide_traceback = provide_traceback - self.freqs = Counter([example["response"] for example in devset]) - self.inv_freqs = {k: 1 / v for k, v in self.freqs.items()} + self.inv_freqs = {k: 1 / v for k, v in Counter([example["response"] for example in devset]).items()} def extract_answer_from_prediction(self, prediction): response = prediction["response"] @@ -82,7 +81,6 @@ def get_matthews_corrcoef(self, preds, return_cm=False): cm = pd.DataFrame(C, index=pd.Index(self.labels, name="Actual"), columns=pd.Index(self.labels, name="Predicted")) - cm["support"] = self.freqs return out, cm return out From 7f5f61c362d19dc98581581efff53cc3a5b73f4d Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Mon, 21 Oct 2024 08:36:07 -0500 Subject: [PATCH 06/24] Fixed confusion matrix/MCC class weighting and made it optional (but still the default). --- dspy/evaluate/confusion.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 1bedb57c4..216b2a9d6 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -1,10 +1,10 @@ +import random import re -from collections import Counter + import numpy as np + from dspy import LabeledFewShot, BootstrapFewShot from dspy.evaluate.evaluate import * -import random - from dspy.teleprompt.teleprompt import Teleprompter @@ -21,6 +21,7 @@ def __init__( return_matrix=False, return_outputs=False, provide_traceback=False, + use_class_weight=True, **_kwargs, ): self.labels = labels @@ -35,28 +36,29 @@ def __init__( self.return_matrix = return_matrix self.return_outputs = return_outputs self.provide_traceback = provide_traceback - self.inv_freqs = {k: 1 / v for k, v in Counter([example["response"] for example in devset]).items()} + self.use_class_weight = use_class_weight - def extract_answer_from_prediction(self, prediction): + def extract_response_from_prediction(self, prediction): response = prediction["response"] match = re.search(r"|".join(self.labels), response.lower()) return match.group(0) if match else None def construct_matrix(self, preds): + weight = {label: 1 / len(pred) if self.use_class_weight else 1 for label, pred in preds.items()} + labels = self.labels # Initialize the confusion matrix - confusion_matrix = np.zeros((len(labels), len(labels)), dtype=float) + confusion_matrix = np.zeros((len(labels), len(labels)), dtype=np.float64) # Get answers - answers = {label: [self.extract_answer_from_prediction(prediction) - for prediction in preds[label]] for label in labels} + responses = {label: [self.extract_response_from_prediction(pred) for pred in preds[label]] for label in labels} # Fill the confusion matrix for idx, label in enumerate(labels): - for answer in answers[label]: - if answer in self.inv_freqs: - confusion_matrix[idx][labels.index(answer)] += self.inv_freqs[label] + for response in responses[label]: + if response in labels: + confusion_matrix[idx][labels.index(response)] += weight[label] return confusion_matrix @@ -288,6 +290,7 @@ def __init__( max_errors=10, stop_at_score=None, metric_threshold=None, + use_class_weight=True, ): self.labels = labels self.teacher_settings = teacher_settings @@ -301,6 +304,8 @@ def __init__( self.max_errors = max_errors self.num_candidate_sets = num_candidate_programs self.max_labeled_demos = max_labeled_demos + + self.use_class_weight = use_class_weight print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.") print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") @@ -363,6 +368,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None max_errors=self.max_errors, display_table=False, display_progress=True, + use_class_weight=self.use_class_weight, ) score, cm = confusion(program, return_matrix=True) From bb2e3e0eb33f70a502a765cef8b969beb690ffc8 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Mon, 21 Oct 2024 09:49:48 -0500 Subject: [PATCH 07/24] Slight cleanup --- dspy/evaluate/confusion.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 216b2a9d6..98fc97c5e 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -227,21 +227,23 @@ def wrapped_program(example_idx, example): preds, ) - dspy.logger.info(f"MCC: {self.get_matthews_corrcoef(preds):.6f}") + mcc, cm = self.get_matthews_corrcoef(preds, return_cm=True) + + dspy.logger.info(f"MCC: {mcc:.6f}") predicted_devset = sorted(reordered_devset) if return_outputs: # Handle the return_outputs logic results = [(example, prediction) for _, example, prediction in predicted_devset] - data = [merge_dicts(example, prediction) for _, example, prediction in predicted_devset] + if display_table: + data = [merge_dicts(example, prediction) for _, example, prediction in predicted_devset] - result_df = pd.DataFrame(data) + result_df = pd.DataFrame(data) - # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) - result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell) + # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) + result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell) - if display_table: if isinstance(display_table, bool): df_to_display = result_df.copy() truncated_rows = 0 @@ -265,8 +267,6 @@ def wrapped_program(example_idx, example): """ display(HTML(message)) - mcc, cm = self.get_matthews_corrcoef(preds, return_cm=True) - if return_matrix and return_outputs: return mcc, results, cm if return_matrix: From 05094210c6dda92f56d10ef7863de1c794e3d183 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Mon, 21 Oct 2024 10:07:52 -0500 Subject: [PATCH 08/24] Error fix --- dspy/evaluate/confusion.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 98fc97c5e..ffa0e4bbc 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -1,5 +1,6 @@ import random import re +from collections import Counter import numpy as np @@ -38,13 +39,14 @@ def __init__( self.provide_traceback = provide_traceback self.use_class_weight = use_class_weight - def extract_response_from_prediction(self, prediction): + def extract(self, prediction): response = prediction["response"] match = re.search(r"|".join(self.labels), response.lower()) return match.group(0) if match else None - def construct_matrix(self, preds): - weight = {label: 1 / len(pred) if self.use_class_weight else 1 for label, pred in preds.items()} + def construct_matrix(self, preds, devset): + # use answers from devset to get weights + weight = {k: 1 / v for k, v in Counter([self.extract(arg) for _, arg in devset]).items()} labels = self.labels @@ -52,7 +54,7 @@ def construct_matrix(self, preds): confusion_matrix = np.zeros((len(labels), len(labels)), dtype=np.float64) # Get answers - responses = {label: [self.extract_response_from_prediction(pred) for pred in preds[label]] for label in labels} + responses = {label: [self.extract(pred) for pred in preds[label]] for label in labels} # Fill the confusion matrix for idx, label in enumerate(labels): @@ -62,8 +64,8 @@ def construct_matrix(self, preds): return confusion_matrix - def get_matthews_corrcoef(self, preds, return_cm=False): - C = self.construct_matrix(preds) + def get_matthews_corrcoef(self, preds, devset, return_cm=False): + C = self.construct_matrix(preds, devset) # rest is from sklearn t_sum = C.sum(axis=1, dtype=np.float64) @@ -141,7 +143,7 @@ def cancellable_wrapped_program(idx, arg): reordered_devset.append((example_idx, example, prediction)) preds[arg["response"]].append(prediction) - self._update_progress(pbar, preds) + self._update_progress(pbar, preds, devset) pbar.close() if self.cancel_jobs.is_set(): @@ -150,8 +152,8 @@ def cancellable_wrapped_program(idx, arg): return reordered_devset - def _update_progress(self, pbar, preds): - mcc = self.get_matthews_corrcoef(preds) + def _update_progress(self, pbar, preds, devset): + mcc = self.get_matthews_corrcoef(preds, devset) pbar.set_description(f"MCC: {mcc:.6f}") pbar.update() @@ -227,7 +229,7 @@ def wrapped_program(example_idx, example): preds, ) - mcc, cm = self.get_matthews_corrcoef(preds, return_cm=True) + mcc, cm = self.get_matthews_corrcoef(preds, devset, return_cm=True) dspy.logger.info(f"MCC: {mcc:.6f}") From 2e4bbd6a4a72f2ff89c454342866b1d26ed1a24b Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Mon, 21 Oct 2024 10:16:26 -0500 Subject: [PATCH 09/24] Slight fix --- dspy/evaluate/confusion.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index ffa0e4bbc..5749232ad 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -45,11 +45,12 @@ def extract(self, prediction): return match.group(0) if match else None def construct_matrix(self, preds, devset): - # use answers from devset to get weights - weight = {k: 1 / v for k, v in Counter([self.extract(arg) for _, arg in devset]).items()} - labels = self.labels + # use answers from devset to get weights + weight = {k: 1 / v for k, v in Counter([self.extract(arg) for _, arg in devset]).items()} \ + if self.use_class_weight else {k: 1 for k in labels} + # Initialize the confusion matrix confusion_matrix = np.zeros((len(labels), len(labels)), dtype=np.float64) From 3dd758db525ffb3e66dbada0199076b175231bfc Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Tue, 22 Oct 2024 10:25:46 -0500 Subject: [PATCH 10/24] Slight fix --- dspy/evaluate/confusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 5749232ad..05404f73f 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -98,7 +98,7 @@ def _execute_single_thread(self, wrapped_program, devset, display_progress, pred example_idx, example, prediction = wrapped_program(idx, arg) reordered_devset.append((example_idx, example, prediction)) preds[arg["response"]].append(prediction) - self._update_progress(pbar, preds) + self._update_progress(pbar, preds, devset) pbar.close() From 163735ff76fb687cb941a6e1b1915e6198721d95 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Thu, 24 Oct 2024 18:20:57 -0500 Subject: [PATCH 11/24] flexible output name --- dspy/evaluate/confusion.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 05404f73f..c4e531c9d 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -23,6 +23,7 @@ def __init__( return_outputs=False, provide_traceback=False, use_class_weight=True, + output_field_name="response", **_kwargs, ): self.labels = labels @@ -38,9 +39,10 @@ def __init__( self.return_outputs = return_outputs self.provide_traceback = provide_traceback self.use_class_weight = use_class_weight + self.output_field_name = output_field_name def extract(self, prediction): - response = prediction["response"] + response = prediction[self.output_field_name] match = re.search(r"|".join(self.labels), response.lower()) return match.group(0) if match else None @@ -97,7 +99,7 @@ def _execute_single_thread(self, wrapped_program, devset, display_progress, pred with logging_redirect_tqdm(): example_idx, example, prediction = wrapped_program(idx, arg) reordered_devset.append((example_idx, example, prediction)) - preds[arg["response"]].append(prediction) + preds[arg[self.output_field_name]].append(prediction) self._update_progress(pbar, preds, devset) pbar.close() @@ -143,7 +145,7 @@ def cancellable_wrapped_program(idx, arg): continue reordered_devset.append((example_idx, example, prediction)) - preds[arg["response"]].append(prediction) + preds[arg[self.output_field_name]].append(prediction) self._update_progress(pbar, preds, devset) pbar.close() @@ -294,6 +296,7 @@ def __init__( stop_at_score=None, metric_threshold=None, use_class_weight=True, + output_field_name="response", ): self.labels = labels self.teacher_settings = teacher_settings @@ -309,6 +312,7 @@ def __init__( self.max_labeled_demos = max_labeled_demos self.use_class_weight = use_class_weight + self.output_field_name = output_field_name print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.") print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") @@ -372,6 +376,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None display_table=False, display_progress=True, use_class_weight=self.use_class_weight, + output_field_name=self.output_field_name, ) score, cm = confusion(program, return_matrix=True) From 4b6980423424f5f56d9502be633bbe91d9172c88 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Thu, 24 Oct 2024 18:35:48 -0500 Subject: [PATCH 12/24] tweaks --- dspy/evaluate/confusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index c4e531c9d..96094f5a7 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -50,11 +50,11 @@ def construct_matrix(self, preds, devset): labels = self.labels # use answers from devset to get weights - weight = {k: 1 / v for k, v in Counter([self.extract(arg) for _, arg in devset]).items()} \ + weight = {k: 1 / v for k, v in Counter([arg for _, arg in devset]).items()} \ if self.use_class_weight else {k: 1 for k in labels} # Initialize the confusion matrix - confusion_matrix = np.zeros((len(labels), len(labels)), dtype=np.float64) + confusion_matrix = np.zeros([len(labels)] * 2, dtype=np.float64) # Get answers responses = {label: [self.extract(pred) for pred in preds[label]] for label in labels} From 594ef27d3f55c293a1214b6c2742b009bb5e541c Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Thu, 24 Oct 2024 18:42:21 -0500 Subject: [PATCH 13/24] tweaks --- dspy/evaluate/confusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 96094f5a7..27b1e3e33 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -50,7 +50,7 @@ def construct_matrix(self, preds, devset): labels = self.labels # use answers from devset to get weights - weight = {k: 1 / v for k, v in Counter([arg for _, arg in devset]).items()} \ + weight = {k: 1 / v for k, v in Counter([arg[self.output_field_name] for _, arg in devset]).items()} \ if self.use_class_weight else {k: 1 for k in labels} # Initialize the confusion matrix From 3e09b43144c267f63a9eecdcbf10276c78c90282 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Fri, 25 Oct 2024 13:18:31 -0500 Subject: [PATCH 14/24] tweaks --- dspy/evaluate/confusion.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 27b1e3e33..d9d6c3acb 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -23,7 +23,7 @@ def __init__( return_outputs=False, provide_traceback=False, use_class_weight=True, - output_field_name="response", + output_field="response", **_kwargs, ): self.labels = labels @@ -39,10 +39,9 @@ def __init__( self.return_outputs = return_outputs self.provide_traceback = provide_traceback self.use_class_weight = use_class_weight - self.output_field_name = output_field_name + self.output_field = output_field - def extract(self, prediction): - response = prediction[self.output_field_name] + def extract(self, response): match = re.search(r"|".join(self.labels), response.lower()) return match.group(0) if match else None @@ -50,20 +49,20 @@ def construct_matrix(self, preds, devset): labels = self.labels # use answers from devset to get weights - weight = {k: 1 / v for k, v in Counter([arg[self.output_field_name] for _, arg in devset]).items()} \ + weight = {k: 1 / v for k, v in Counter([arg[self.output_field] for _, arg in devset]).items()} \ if self.use_class_weight else {k: 1 for k in labels} # Initialize the confusion matrix confusion_matrix = np.zeros([len(labels)] * 2, dtype=np.float64) # Get answers - responses = {label: [self.extract(pred) for pred in preds[label]] for label in labels} + answers = {label: [self.extract(pred) for pred in preds[label]] for label in labels} # Fill the confusion matrix for idx, label in enumerate(labels): - for response in responses[label]: - if response in labels: - confusion_matrix[idx][labels.index(response)] += weight[label] + for answer in answers[label]: + if answer in labels: + confusion_matrix[idx][labels.index(answer)] += weight[label] return confusion_matrix @@ -99,7 +98,7 @@ def _execute_single_thread(self, wrapped_program, devset, display_progress, pred with logging_redirect_tqdm(): example_idx, example, prediction = wrapped_program(idx, arg) reordered_devset.append((example_idx, example, prediction)) - preds[arg[self.output_field_name]].append(prediction) + preds[arg[self.output_field]].append(prediction[self.output_field]) self._update_progress(pbar, preds, devset) pbar.close() @@ -145,7 +144,7 @@ def cancellable_wrapped_program(idx, arg): continue reordered_devset.append((example_idx, example, prediction)) - preds[arg[self.output_field_name]].append(prediction) + preds[arg[self.output_field]].append(prediction[self.output_field]) self._update_progress(pbar, preds, devset) pbar.close() @@ -296,7 +295,7 @@ def __init__( stop_at_score=None, metric_threshold=None, use_class_weight=True, - output_field_name="response", + output_field="response", ): self.labels = labels self.teacher_settings = teacher_settings @@ -312,7 +311,7 @@ def __init__( self.max_labeled_demos = max_labeled_demos self.use_class_weight = use_class_weight - self.output_field_name = output_field_name + self.output_field = output_field print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.") print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") @@ -376,7 +375,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None display_table=False, display_progress=True, use_class_weight=self.use_class_weight, - output_field_name=self.output_field_name, + output_field=self.output_field, ) score, cm = confusion(program, return_matrix=True) From 628dd34c6fef4ba07643edd0d185359b3d73ba9d Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Fri, 25 Oct 2024 16:12:48 -0500 Subject: [PATCH 15/24] tweak --- dspy/evaluate/confusion.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index d9d6c3acb..e6f717b85 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -48,14 +48,18 @@ def extract(self, response): def construct_matrix(self, preds, devset): labels = self.labels - # use answers from devset to get weights - weight = {k: 1 / v for k, v in Counter([arg[self.output_field] for _, arg in devset]).items()} \ - if self.use_class_weight else {k: 1 for k in labels} + if self.use_class_weight: + # use devset to get weights + classes = [arg[self.output_field] for _, arg in devset] + class_counts = Counter(classes) + weight = {k: 1 / v for k, v in class_counts.items()} + else: + weight = {k: 1 for k in labels} # Initialize the confusion matrix confusion_matrix = np.zeros([len(labels)] * 2, dtype=np.float64) - # Get answers + # Get model answers answers = {label: [self.extract(pred) for pred in preds[label]] for label in labels} # Fill the confusion matrix From 92fd90daefae51a45af0b45330a3ed67d0a72c01 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Sat, 26 Oct 2024 14:30:51 -0500 Subject: [PATCH 16/24] error fix --- dspy/evaluate/confusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index e6f717b85..021314f11 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -214,7 +214,7 @@ def wrapped_program(example_idx, example): f"Error for example in dev set: \t\t {e}. Set `provide_traceback=True` to see the stack trace." ) - return example_idx, example, {}, 0.0 + return example_idx, example, {self.output_field: "error"} finally: if creating_new_thread: del thread_stacks[threading.get_ident()] From 95ef2719a79bb9a92c133266c99c27ff0c70f5e6 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Sun, 27 Oct 2024 13:31:50 -0500 Subject: [PATCH 17/24] tweak --- dspy/evaluate/confusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 021314f11..3078c7c41 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -48,8 +48,8 @@ def extract(self, response): def construct_matrix(self, preds, devset): labels = self.labels + # Calculate class weights if self.use_class_weight: - # use devset to get weights classes = [arg[self.output_field] for _, arg in devset] class_counts = Counter(classes) weight = {k: 1 / v for k, v in class_counts.items()} From 192c7d815b4caaaa7132ce468cffcb22a9304d92 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Tue, 29 Oct 2024 07:03:04 -0500 Subject: [PATCH 18/24] small efficiency tweaks --- dspy/evaluate/confusion.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 3078c7c41..7eb12f8b6 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -72,20 +72,23 @@ def construct_matrix(self, preds, devset): def get_matthews_corrcoef(self, preds, devset, return_cm=False): C = self.construct_matrix(preds, devset) - # rest is from sklearn + # t_sum = C.sum(axis=1, dtype=np.float64) p_sum = C.sum(axis=0, dtype=np.float64) - n_correct = np.trace(C, dtype=np.float64) n_samples = p_sum.sum() - cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) - cov_ypyp = n_samples ** 2 - np.dot(p_sum, p_sum) - cov_ytyt = n_samples ** 2 - np.dot(t_sum, t_sum) + n_samples_2 = n_samples ** 2 + cov_ypyp = n_samples_2 - np.dot(p_sum, p_sum) + cov_ytyt = n_samples_2 - np.dot(t_sum, t_sum) + prod = cov_ypyp * cov_ytyt - if cov_ypyp * cov_ytyt == 0: + if prod == 0: out = 0.0 else: - out = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) + n_correct = np.trace(C, dtype=np.float64) + cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) + out = cov_ytyp / np.sqrt(prod) + # if return_cm: cm = pd.DataFrame(C, From af3c088f8d7988f177e278e9a39e66b0b00b478a Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Tue, 29 Oct 2024 07:32:16 -0500 Subject: [PATCH 19/24] whitespace --- dspy/evaluate/confusion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 7eb12f8b6..ea6ee3d68 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -73,7 +73,7 @@ def construct_matrix(self, preds, devset): def get_matthews_corrcoef(self, preds, devset, return_cm=False): C = self.construct_matrix(preds, devset) - # + # t_sum = C.sum(axis=1, dtype=np.float64) p_sum = C.sum(axis=0, dtype=np.float64) n_samples = p_sum.sum() @@ -88,7 +88,7 @@ def get_matthews_corrcoef(self, preds, devset, return_cm=False): n_correct = np.trace(C, dtype=np.float64) cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) out = cov_ytyp / np.sqrt(prod) - # + # if return_cm: cm = pd.DataFrame(C, @@ -316,7 +316,7 @@ def __init__( self.max_errors = max_errors self.num_candidate_sets = num_candidate_programs self.max_labeled_demos = max_labeled_demos - + self.use_class_weight = use_class_weight self.output_field = output_field From e3e6799f0b183f28e962e4b056386ce1ccd3d9da Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Tue, 29 Oct 2024 10:56:51 -0500 Subject: [PATCH 20/24] infer labels from classes in devset --- dspy/evaluate/confusion.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index ea6ee3d68..2689208c9 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -13,7 +13,6 @@ class Confusion: def __init__( self, *, - labels, devset, num_threads=1, display_progress=False, @@ -26,7 +25,6 @@ def __init__( output_field="response", **_kwargs, ): - self.labels = labels self.devset = devset self.num_threads = num_threads self.display_progress = display_progress @@ -41,16 +39,19 @@ def __init__( self.use_class_weight = use_class_weight self.output_field = output_field - def extract(self, response): - match = re.search(r"|".join(self.labels), response.lower()) + def extract(self, response, labels): + match = re.search(r"|".join(labels), response.lower()) return match.group(0) if match else None - def construct_matrix(self, preds, devset): - labels = self.labels + def construct_labels_and_matrix(self, devset, preds=None): + classes = [arg[self.output_field] for _, arg in devset] + labels = np.unique(classes).tolist() + + if preds is None: + return labels # Calculate class weights if self.use_class_weight: - classes = [arg[self.output_field] for _, arg in devset] class_counts = Counter(classes) weight = {k: 1 / v for k, v in class_counts.items()} else: @@ -60,7 +61,7 @@ def construct_matrix(self, preds, devset): confusion_matrix = np.zeros([len(labels)] * 2, dtype=np.float64) # Get model answers - answers = {label: [self.extract(pred) for pred in preds[label]] for label in labels} + answers = {label: [self.extract(pred, labels) for pred in preds[label]] for label in labels} # Fill the confusion matrix for idx, label in enumerate(labels): @@ -68,10 +69,10 @@ def construct_matrix(self, preds, devset): if answer in labels: confusion_matrix[idx][labels.index(answer)] += weight[label] - return confusion_matrix + return labels, confusion_matrix - def get_matthews_corrcoef(self, preds, devset, return_cm=False): - C = self.construct_matrix(preds, devset) + def get_matthews_corrcoef(self, devset, preds, return_cm=False): + labels, C = self.construct_labels_and_matrix(devset, preds) # t_sum = C.sum(axis=1, dtype=np.float64) @@ -92,8 +93,8 @@ def get_matthews_corrcoef(self, preds, devset, return_cm=False): if return_cm: cm = pd.DataFrame(C, - index=pd.Index(self.labels, name="Actual"), - columns=pd.Index(self.labels, name="Predicted")) + index=pd.Index(labels, name="Actual"), + columns=pd.Index(labels, name="Predicted")) return out, cm return out @@ -162,7 +163,7 @@ def cancellable_wrapped_program(idx, arg): return reordered_devset def _update_progress(self, pbar, preds, devset): - mcc = self.get_matthews_corrcoef(preds, devset) + mcc = self.get_matthews_corrcoef(devset, preds) pbar.set_description(f"MCC: {mcc:.6f}") pbar.update() @@ -224,8 +225,10 @@ def wrapped_program(example_idx, example): devset = list(enumerate(devset)) tqdm.tqdm._instances.clear() + + labels = self.construct_labels_and_matrix(devset) - preds = {label: [] for label in self.labels} + preds = {label: [] for label in labels} if num_threads == 1: reordered_devset = self._execute_single_thread(wrapped_program, devset, display_progress, preds) @@ -238,7 +241,7 @@ def wrapped_program(example_idx, example): preds, ) - mcc, cm = self.get_matthews_corrcoef(preds, devset, return_cm=True) + mcc, cm = self.get_matthews_corrcoef(devset, preds, return_cm=True) dspy.logger.info(f"MCC: {mcc:.6f}") @@ -291,7 +294,6 @@ def wrapped_program(example_idx, example): class MCCBootstrapFewShotWithRandomSearch(Teleprompter): def __init__( self, - labels, teacher_settings={}, max_bootstrapped_demos=4, max_labeled_demos=16, @@ -304,7 +306,6 @@ def __init__( use_class_weight=True, output_field="response", ): - self.labels = labels self.teacher_settings = teacher_settings self.max_rounds = max_rounds @@ -375,7 +376,6 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) confusion = Confusion( - labels=self.labels, devset=self.valset, num_threads=self.num_threads, max_errors=self.max_errors, From f76af39e039b6373b51b18879151756f59f9abd4 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Tue, 29 Oct 2024 14:35:59 -0500 Subject: [PATCH 21/24] whitespace --- dspy/evaluate/confusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 2689208c9..a64470a34 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -225,7 +225,7 @@ def wrapped_program(example_idx, example): devset = list(enumerate(devset)) tqdm.tqdm._instances.clear() - + labels = self.construct_labels_and_matrix(devset) preds = {label: [] for label in labels} From e49c2ff6967c57c351738205832d14fe46fba2db Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Wed, 30 Oct 2024 07:40:42 -0500 Subject: [PATCH 22/24] added matching options besides first --- dspy/evaluate/confusion.py | 46 ++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index a64470a34..3617f18a4 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -1,6 +1,7 @@ +import inspect import random -import re from collections import Counter +from re import findall import numpy as np @@ -9,6 +10,16 @@ from dspy.teleprompt.teleprompt import Teleprompter +def most_votes(votes): + """Only returns a value if there is a single winner.""" + if votes: + counts = Counter(votes) + max_count = max(counts.values()) + winners = [label for label, count in counts.items() if count == max_count] + if len(winners) == 1: + return winners[0] + + class Confusion: def __init__( self, @@ -23,6 +34,8 @@ def __init__( provide_traceback=False, use_class_weight=True, output_field="response", + extract=None, + match="first", **_kwargs, ): self.devset = devset @@ -38,13 +51,32 @@ def __init__( self.provide_traceback = provide_traceback self.use_class_weight = use_class_weight self.output_field = output_field + if extract is None: + self.extract = self._extract + if callable(match): + self.match = match + elif match == "first": + self.match = lambda x: x[0] + elif match == "last": + self.match = lambda x: x[-1] + elif match == "most": + self.match = most_votes + else: + raise ValueError(f"Invalid match function: {match}") + elif not callable(extract) or len(inspect.signature(extract).parameters) != 2: + raise ValueError("The extract function must be callable and have two parameters (response and labels).") + else: + self.extract = extract + # match field is ignored - def extract(self, response, labels): - match = re.search(r"|".join(labels), response.lower()) - return match.group(0) if match else None + def _extract(self, response, labels): + found = findall(r"|".join(labels), response.lower()) + if not found: + return None + return self.match(found) def construct_labels_and_matrix(self, devset, preds=None): - classes = [arg[self.output_field] for _, arg in devset] + classes = [arg[self.output_field].lower() for _, arg in devset] labels = np.unique(classes).tolist() if preds is None: @@ -106,7 +138,7 @@ def _execute_single_thread(self, wrapped_program, devset, display_progress, pred with logging_redirect_tqdm(): example_idx, example, prediction = wrapped_program(idx, arg) reordered_devset.append((example_idx, example, prediction)) - preds[arg[self.output_field]].append(prediction[self.output_field]) + preds[arg[self.output_field]].append(prediction.get(self.output_field, "error")) self._update_progress(pbar, preds, devset) pbar.close() @@ -152,7 +184,7 @@ def cancellable_wrapped_program(idx, arg): continue reordered_devset.append((example_idx, example, prediction)) - preds[arg[self.output_field]].append(prediction[self.output_field]) + preds[arg[self.output_field]].append(prediction.get(self.output_field, "error")) self._update_progress(pbar, preds, devset) pbar.close() From 07f164c4701e520fb81c78775485e6a9d93c1e49 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Wed, 30 Oct 2024 07:46:09 -0500 Subject: [PATCH 23/24] slight rephrasing of `_extract` --- dspy/evaluate/confusion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index 3617f18a4..b6ff63719 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -71,9 +71,8 @@ def __init__( def _extract(self, response, labels): found = findall(r"|".join(labels), response.lower()) - if not found: - return None - return self.match(found) + if found: + return self.match(found) def construct_labels_and_matrix(self, devset, preds=None): classes = [arg[self.output_field].lower() for _, arg in devset] From 42548d1346a298edb0176788e75361d46689db00 Mon Sep 17 00:00:00 2001 From: Chris Coffee Date: Wed, 30 Oct 2024 07:52:33 -0500 Subject: [PATCH 24/24] removed redundant if statement --- dspy/evaluate/confusion.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dspy/evaluate/confusion.py b/dspy/evaluate/confusion.py index b6ff63719..217ddf037 100644 --- a/dspy/evaluate/confusion.py +++ b/dspy/evaluate/confusion.py @@ -12,12 +12,11 @@ def most_votes(votes): """Only returns a value if there is a single winner.""" - if votes: - counts = Counter(votes) - max_count = max(counts.values()) - winners = [label for label, count in counts.items() if count == max_count] - if len(winners) == 1: - return winners[0] + counts = Counter(votes) + max_count = max(counts.values()) + winners = [label for label, count in counts.items() if count == max_count] + if len(winners) == 1: + return winners[0] class Confusion: