diff --git a/scripts/run_evaluation.py b/scripts/run_evaluation.py
index 7d9d5fbdc..faa8df35a 100644
--- a/scripts/run_evaluation.py
+++ b/scripts/run_evaluation.py
@@ -3,13 +3,13 @@
 Metrics used: Accuracy / Micro & Macro F1.
 """
 import os
+import numpy as np
 from sklearn import preprocessing
 from sklearn.metrics import f1_score, accuracy_score
 import argparse
 import json
 from tqdm import tqdm
 import datasets
-import evaluate
 from run_inference import TunedCausalLM
 from shutil import rmtree
 
@@ -141,109 +141,73 @@ def postprocess_output(output_text, delimiter):
     if delimiter is not None:
         return [text_substr.strip() for text_substr in output_text.split(delimiter)]
     return [output_text.strip()]
-### Metric computation/display & utils for mapping labels to numerics for hf evaluate
-def map_predictions_and_references_to_numerics(predictions: list, references: list) -> tuple:
-    """Maps string predictions and references to numerics for use in accuracy and
-    f1 computations. This process is generally ambiguous and can be done a number of
-    ways, but the strategy we use is as follows:
-    - Prior to consideration, all predictions and references are stripped of whitespace
-    - Map all unique reference values to integers
-    - Apply mapping of ref -> int to predictions; anything else is mapped to an unknown label val
-      where the unknown label is treated as its own class
-    Important caveats:
-    - this strategy is case sensitive
-    - this cannot be used for multioutput classification problems as is, since the entire
-      predicted text is treated as a single label
-    - be careful about the value of the max number of tokens for generation, since this
-      essentially boils down to a string match problem
-
-    Args:
-        predictions: list
-            List of strings to be converted to class indices predicted by model.
-        references[list]
-            List of strings to be converted to class indices for ground truth.
-
-    Returns:
-        tuple
-            Tuple containing:
-                int_predictions [list of ints] class indices for predicted samples
-                ref_predictions [list of ints] class indices for ground truth samples
-                label_map [dict] dict mapping indices to strings
-    """
-    le = preprocessing.LabelEncoder()
-    le.fit(predictions)
-    # Label encoder maps from class indices from [0, n-1], so we use n as our throwaway class
-    unk_label = le.classes_.shape[0]
-    int_predictions = [get_encoded_label(le, pred, unk_label) for pred in predictions]
-    int_references = [get_encoded_label(le, references, unk_label) for pred in predictions]
-    # Generate the class mapping + the unk label
+### Metric computation/display & utils for mapping labels to numerics for sklearn
+def map_predictions_and_references_to_encoded_vectors(predictions: list, references: list):
+    """Map the delimited prediction / reference label lists onto binary label vectors for sklearn."""
+    ohe = preprocessing.OneHotEncoder()
+    # Extract the unique (potentially delimited) labels to fit the one hot encoder. We need to do
+    # this directly in case it's a multiclass/multilabel scenario, because the 2D arr consumed
+    # by the OHE expects consistent axis shapes, i.e., columns are treated as different features,
+    # and cannot have a variable number of values.
+    unk_label = ""
+    unique_labels = extract_unique_labels(predictions, references, unk_label)
+    ohe.fit(unique_labels)
+
+    # Now get the encoded vectors for our references and our predictions by one hot encoding
+    # the unique sublabels and collapsing them into one vector along the row dimension.
+    reference_vectors = [get_encoded_vector(ohe, refs, unk_label) for refs in references]
+    pred_vectors = [get_encoded_vector(ohe, preds, unk_label) for preds in predictions]
+
+    # For debugging purposes - map the indices in our one hot encoded entries back to labels.
+    # NOTE: the categories_ attr is a 2D array of features, and we only care about [0]
+    # since the uniquely extracted labels are only single dim features when fitting
+    # the transform itself.
     label_map = {
-        idx: label for idx, label in enumerate(le.inverse_transform(list(range(unk_label))))
+        idx: label for idx, label in enumerate(ohe.categories_[0])
     }
-    label_map[unk_label] = ""
-    return int_predictions, int_references, label_map
-
-def get_encoded_label(le: preprocessing.LabelEncoder, gen_text: str, unk_label: int) -> int:
-    """Gets the encoded label of a text string.
-    Args:
-        le: preprocessing.LabelEncode
-            Label Encoder object which maps text strings into class indices.
-        gen_text: str
-            Text that was generated as a label by the model.
-        unk_label: int
-            Label to be used for unknown / garbage generation, i.e., things unknown to the
-            label encoder.
-
-    Returns:
-        int
-            The integer label index corresponding to the generated text.
-    """
-    try:
-        return le.transform(gen_text)[0]
-    except ValueError:
-        # Model generated text that is not a valid label, i.e., is not in the label encoder
-        return unk_label
-
-
-def compute_metrics_dict(int_preds: list, int_references: list) -> dict:
-    """Calculate the metrics on the (int) lists of preds against ground truth.
-
-    Args:
-        int_preds: list
-            list of class indices for texts generated by the model.
-        int_references: list
-            list of class indices for ground truth labels.
-
-    Returns:
-        dict
-            Dictionary containing F1 / accuracy metrics.
-    """
-    f1_metric = evaluate.load("f1")
-    accuracy_metric = evaluate.load("accuracy")
-    # Compute the micro & macro f1 scores
-    micro_f1 = f1_metric.compute(predictions=int_preds, references=int_references, average="micro")
-    macro_f1 = f1_metric.compute(predictions=int_preds, references=int_references, average="macro")
-    # Compute the accuracy
-    accuracy = accuracy_metric.compute(predictions=int_preds, references=int_references)
-    return {
-        "f1": {
-            "micro": micro_f1,
-            "macro": macro_f1,
-        },
-        "accuracy": accuracy
-    }
-
-
-#### Replaces legacy logic
-def map_predictions_and_references_to_one_hot_encoded_vectors(predictions: list, references: list):
-    # Currently it's a stub that we can use to validate our metric correctness
-    references = [[1,1,0], [1,0,1], [0,0,1], [0,0,0]]
-    predictions = [[0,1,1], [1,0,0], [0,0,1], [1,0,1]]
-    label_map = {0: "bird", 2: "cat", 3: "dog"}
-    # In this scenario, dog basically represents
-    return references, predictions, label_map
+    return pred_vectors, reference_vectors, label_map
+
+def get_encoded_vector(ohe, texts, unk_label) -> np.ndarray:
+    """Collapse the one hot encodings of a (delimited) label list into a single binary vector."""
+    # Since our encoded vector is built on collapsing one hot encoded vectors,
+    # we need to explicitly handle the empty case since it is not one hot encodable.
+    if not texts:
+        return np.zeros(len(ohe.categories_[0]))
+    # Clean the generated text list; anything in the list that is not known to the
+    # one hot encoder gets replaced by the unk_label. It is okay if we have multiple unk_labels
+    # in the vector, since all of these just map to one positive entry in the encoded vector.
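+    # Illustrative (hypothetical) example: with fitted categories ["", "cat", "dog"], generated
+    # sublabels ["cat", "not-a-known-label"] clean to {"cat", ""}, one hot encode to the rows
+    # [[0, 1, 0], [1, 0, 0]], and collapse to the binary vector [1, 1, 0].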
+    cleaned_texts = list(set([text if text in ohe.categories_[0] else unk_label for text in texts]))
+    # Encode the cleaned text as a 2D feature array of one hot encoded vectors
+    vec_stack = ohe.transform([[text] for text in cleaned_texts]).toarray()
+
+    # Then collapse the stack of one hot encoded vectors along the row axis (axis 0) to get
+    # the encoded binary vector for the multilabel / multiclass prediction.
+    return vec_stack.sum(axis=0)
+
+def extract_unique_labels(predictions, references, unk_label):
+    """Grab all of the unique labels and return them as a list of single feature lists."""
+    unique_ref_labels = set()
+    for ref in references:
+        for sub_label in ref:
+            # This is pretty unlikely to happen (a class literally named ""), but for now, raise
+            # if we see it, since it would currently skew the results.
+            if sub_label == unk_label:
+                raise ValueError(f"Unk label {unk_label} is being used as a ground truth label!")
+            unique_ref_labels.add(sub_label)
+
+    ref_label_list = [[label] for label in unique_ref_labels]
+    # HACK - traverse the predictions and see if any unk predictions were made; if so, make a
+    # garbage class, which we will mark as false positives here.
+    for pred in predictions:
+        for sub_pred in pred:
+            # One of our delimited predictions is unknown!
+            if sub_pred not in unique_ref_labels:
+                # Add the unk label once we know that it isn't a field in our eval data
+                print("Adding label to handle garbage label generation")
+                ref_label_list.append([unk_label])
+                return ref_label_list
+    return ref_label_list
 
 def compute_metrics_dict_multi(enc_preds, enc_refs):
     micro_f1 = f1_score(enc_refs, enc_preds, average="micro")
@@ -293,7 +257,7 @@ def export_experiment_info(metrics_dict: dict, label_map: dict, model_pred_file_
     model = TunedCausalLM.load(args.model)
     data = datasets.load_dataset("json", data_files=args.data_path, split=args.split)
     predictions, references, model_pred_file_info = get_prediction_results(model, data, args.max_new_tokens, args.delimiter)
-    int_preds, int_references, label_map = map_predictions_and_references_to_one_hot_encoded_vectors(predictions, references)
+    int_preds, int_references, label_map = map_predictions_and_references_to_encoded_vectors(predictions, references)
    metrics_dict = compute_metrics_dict_multi(int_preds, int_references)
     experiment_metadata = {
         "model": args.model,
@@ -301,17 +265,3 @@ def export_experiment_info(metrics_dict: dict, label_map: dict, model_pred_file_
         "data_path": args.data_path,
     }
     export_experiment_info(metrics_dict, label_map, model_pred_file_info, experiment_metadata, args.output_dir)
-
-
-
-"""
-python3 run_evaluation.py --model TinyLlama/TinyLlama-1.1B-step-50K-105b --data_path stanford_alpaca/alpaca_data.json --max_new_tokens 10
-
-{
-    'input': '',
-    'instruction': 'Give three tips for staying healthy.',
-    'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'
-}
-
-note: we need scikit learn and evaluate for this script [since f1 is also written on top of sklearn]
-"""
\ No newline at end of file
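
A minimal sanity-check sketch (not part of the patch) of the new sklearn-based metric path on hypothetical toy labels. The inline encode() helper simply mirrors what the patched get_encoded_vector does, and the metric calls match compute_metrics_dict_multi.

import numpy as np
from sklearn import preprocessing
from sklearn.metrics import f1_score, accuracy_score

references = [["cat"], ["dog"], ["bird"]]
predictions = [["cat"], ["dog", "bird"], ["some garbage generation"]]

unk_label = ""  # stand-in class for generations that match no reference label
unique_labels = [[sub] for sub in {sub for ref in references for sub in ref}] + [[unk_label]]
ohe = preprocessing.OneHotEncoder()
ohe.fit(unique_labels)

def encode(texts):
    # Map unknown sublabels to unk_label, one hot encode, and collapse to one binary vector
    cleaned = list({text if text in ohe.categories_[0] else unk_label for text in texts})
    return ohe.transform([[text] for text in cleaned]).toarray().sum(axis=0)

enc_refs = np.array([encode(ref) for ref in references])
enc_preds = np.array([encode(pred) for pred in predictions])

# (macro f1 will warn that the unk column has no true samples; that is expected here)
print("micro f1:", f1_score(enc_refs, enc_preds, average="micro"))
print("macro f1:", f1_score(enc_refs, enc_preds, average="macro"))
print("accuracy:", accuracy_score(enc_refs, enc_preds))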