Update eval script for multioutput
Signed-off-by: Alex-Brooks <[email protected]>
alex-jw-brooks committed Apr 8, 2024
1 parent 0bac827 commit a444164
Showing 1 changed file with 65 additions and 115 deletions.
180 changes: 65 additions & 115 deletions scripts/run_evaluation.py
@@ -3,13 +3,13 @@
Metrics used: Accuracy / Micro & Macro F1.
"""
import os
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import f1_score, accuracy_score
import argparse
import json
from tqdm import tqdm
import datasets
import evaluate
from run_inference import TunedCausalLM
from shutil import rmtree

@@ -141,109 +141,73 @@ def postprocess_output(output_text, delimiter):
if delimiter is not None:
return [text_substr.strip() for text_substr in output_text.split(delimiter)]
return [output_text.strip()]
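For reference, a rough sanity check of the (unchanged) postprocessing helper above; the inputs are invented for illustration:

postprocess_output("sports, politics ", ",")   # -> ["sports", "politics"]
postprocess_output("sports\n", None)           # -> ["sports"]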
### Metric computation/display & utils for mapping labels to numerics for hf evaluate
def map_predictions_and_references_to_numerics(predictions: list, references: list) -> tuple:
"""Maps string predictions and references to numerics for use in accuracy and
f1 computations. This process is generally ambiguous and can be done a number of
ways, but the strategy we use is as follows:

- Prior to consideration, all predictions and references are stripped of whitespace
- Map all unique reference values to integers
- Apply mapping of ref -> int to predictions; anything else is mapped to an unknown label val
where the unknown label is treated as its own class

Important caveats:
- this strategy is case sensitive
- this cannot be used for multioutput classification problems as is, since the entire
predicted text is treated as a single label
- be careful about the value of the max number of tokens for generation, since this
essentially boils down to a string match problem
Args:
predictions: list
List of strings to be converted to class indices predicted by model.
references[list]
List of strings to be converted to class indices for ground truth.
Returns:
tuple
Tuple containing:
int_predictions [list of ints] class indices for predicted samples
ref_predictions [list of ints] class indices for ground truth samples
label_map [dict] dict mapping indices to strings
"""
le = preprocessing.LabelEncoder()
le.fit(predictions)
# Label encoder maps from class indices from [0, n-1], so we use n as our throwaway class
unk_label = le.classes_.shape[0]
int_predictions = [get_encoded_label(le, pred, unk_label) for pred in predictions]
int_references = [get_encoded_label(le, references, unk_label) for pred in predictions]
# Generate the class mapping + the unk label
### Metric computation/display & utils for mapping labels to numerics for sklearn
def map_predictions_and_references_to_encoded_vectors(predictions: list, references: list):
ohe = preprocessing.OneHotEncoder()
# Extract the unique (potentially delimited) labels to fit the one hot encoder. We need to do
# this directly in case it's a multiclass/multilabel scenario, because the 2D array consumed
# by the OHE expects consistent axis shapes, i.e., columns are treated as different features
# and cannot have a variable number of values.
unk_label = "<UNKNOWN>"
unique_labels = extract_unique_labels(predictions, references, unk_label)
ohe.fit(unique_labels)

# Now get the encoded vectors for our references and our predictions by one hot encoding
# the unique sublabels and collapsing them into one vector along the row dimension.
reference_vectors = [get_encoded_vector(ohe, refs, unk_label) for refs in references]
pred_vectors = [get_encoded_vector(ohe, preds, unk_label) for preds in predictions]

# For debugging purposes, map the indices in our one hot encoded entries.
# NOTE: the categories_ attr is a 2D array of features, and we only care about [0]
# since the uniquely extracted labels are only single dim features when fitting
# the transform itself.
label_map = {
idx: label for idx, label in enumerate(le.inverse_transform(list(range(unk_label))))
idx: label for idx, label in enumerate(ohe.categories_[0])
}
label_map[unk_label] = "<UNKNOWN LABEL>"
return int_predictions, int_references, label_map

def get_encoded_label(le: preprocessing.LabelEncoder, gen_text: str, unk_label: int) -> int:
"""Gets the encoded label of a text string.
Args:
le: preprocessing.LabelEncode
Label Encoder object which maps text strings into class indices.
gen_text: str
Text that was generated as a label by the model.
unk_label: int
Label to be used for unknown / garbage generation, i.e., things unknown to the
label encoder.
Returns:
int
The integer label index corresponding to the generated text.
"""
try:
return le.transform(gen_text)[0]
except ValueError:
# Model generated text that is not a valid label, i.e., is not in the label encoder
return unk_label


def compute_metrics_dict(int_preds: list, int_references: list) -> dict:
"""Calculate the metrics on the (int) lists of preds against ground truth.
Args:
int_preds: list
list of class indices for texts generated by the model.
int_references: list
list of class indices for ground truth labels.
Returns:
dict
Dictionary containing F1 / accuracy metrics.
"""
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")
# Compute the micro & macro f1 scores
micro_f1 = f1_metric.compute(predictions=int_preds, references=int_references, average="micro")
macro_f1 = f1_metric.compute(predictions=int_preds, references=int_references, average="macro")
# Compute the accuracy
accuracy = accuracy_metric.compute(predictions=int_preds, references=int_references)
return {
"f1": {
"micro": micro_f1,
"macro": macro_f1,
},
"accuracy": accuracy
}


#### Replaces legacy logic
def map_predictions_and_references_to_one_hot_encoded_vectors(predictions: list, references: list):
# Currently it's a stub that we can use to validate our metric correctness
references = [[1,1,0], [1,0,1], [0,0,1], [0,0,0]]
predictions = [[0,1,1], [1,0,0], [0,0,1], [1,0,1]]
label_map = {0: "bird", 2: "cat", 3: "dog"}
# In this scenario, dog basically represents <UNK>
return references, predictions, label_map
return pred_vectors, reference_vectors, label_map
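To make the new flow concrete, a rough usage sketch with invented toy labels, assuming the helpers below behave as written (the second prediction contains an unknown sub-label):

predictions = [["cat"], ["bird", "garbage"]]
references = [["cat"], ["bird", "dog"]]
preds, refs, label_map = map_predictions_and_references_to_encoded_vectors(predictions, references)
# label_map is roughly {0: "<UNKNOWN>", 1: "bird", 2: "cat", 3: "dog"}, since OneHotEncoder sorts categories
# refs is roughly [[0, 0, 1, 0], [0, 1, 0, 1]]
# preds is roughly [[0, 0, 1, 0], [1, 1, 0, 0]]  ("garbage" collapses onto <UNKNOWN>)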

def get_encoded_vector(ohe, texts, unk_label) -> np.ndarray:
# Since our encoded vector is built by collapsing one hot encoded vectors,
# we need to explicitly handle the empty case, since it is not one hot encodable.
if not texts:
return np.zeros(len(ohe.categories_[0]))
# Clean the generated text list; anything that is in the list that is not known to the
# one hot encoder gets replaced by the unk_label. It is okay if we have multiple unk_labels
# in the vector, since all of these just map to one positive entry in the encoded vector.
cleaned_texts = list(set([text if text in ohe.categories_[0] else unk_label for text in texts]))
# Encode the cleaned text as a 2D feature array of one hot encoded vectors
vec_stack = ohe.transform([[text] for text in cleaned_texts]).toarray()

# Then collapse the stack of one hot encoded vectors (summing over the row axis) to get
# the encoded binary vector for the multilabel / multiclass prediction.
return vec_stack.sum(axis=0)
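As a minimal standalone illustration of that collapse (toy labels, not tied to any dataset), fitting the encoder on single-feature rows and summing the transformed rows yields the multi-hot vector:

from sklearn import preprocessing

ohe = preprocessing.OneHotEncoder()
ohe.fit([["bird"], ["cat"], ["dog"]])               # ohe.categories_[0] -> ['bird', 'cat', 'dog']
rows = ohe.transform([["cat"], ["dog"]]).toarray()  # [[0., 1., 0.], [0., 0., 1.]]
rows.sum(axis=0)                                    # array([0., 1., 1.]) -> "cat" and "dog" are present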

def extract_unique_labels(predictions, references, unk_label):
"""Grab all of the unique labels and return them as a list of single feature lists."""
unique_ref_labels = set()
for ref in references:
for sub_label in ref:
# This is pretty unlikely to happen (a ground truth class literally named "<UNKNOWN>"), but
# raise for now if we see it, since it would currently skew the results.
if sub_label == unk_label:
raise ValueError(f"Unk label {unk_label} is being used as a ground truth label!")
unique_ref_labels.add(sub_label)

ref_label_list = [[label] for label in unique_ref_labels]
# HACK - traverse the predictions and see if any unk predictions were made; if so, make a
# garbage <UNKNOWN> class, which we will mark as false positives here.
for pred in predictions:
for sub_pred in pred:
# One of our delimited predictions is unknown!
if sub_pred not in unique_ref_labels:
# Add the unk label once we know that it isn't a field in our eval data
print("Adding <unk> label to handle garbage label generation")
ref_label_list.append([unk_label])
return ref_label_list
return ref_label_list
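With the same toy inputs as above, this helper should return something like the following (ordering of the known labels depends on set iteration):

extract_unique_labels([["cat"], ["bird", "garbage"]], [["cat"], ["bird", "dog"]], "<UNKNOWN>")
# -> [["cat"], ["bird"], ["dog"], ["<UNKNOWN>"]]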

def compute_metrics_dict_multi(enc_preds, enc_refs):
micro_f1 = f1_score(enc_refs, enc_preds, average="micro")
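The metrics now come straight from scikit-learn on these binary indicator matrices; the rest of compute_metrics_dict_multi is collapsed in this view, but presumably mirrors the old compute_metrics_dict (macro F1 and accuracy alongside micro F1). A rough sketch of the underlying calls, reusing the toy vectors from the removed stub:

from sklearn.metrics import f1_score, accuracy_score

refs = [[1, 1, 0], [1, 0, 1], [0, 0, 1], [0, 0, 0]]
preds = [[0, 1, 1], [1, 0, 0], [0, 0, 1], [1, 0, 1]]
f1_score(refs, preds, average="micro")   # F1 over pooled per-label decisions
f1_score(refs, preds, average="macro")   # unweighted mean of per-label F1
accuracy_score(refs, preds)              # subset accuracy: a row must match exactly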
@@ -293,25 +257,11 @@ def export_experiment_info(metrics_dict: dict, label_map: dict, model_pred_file_
model = TunedCausalLM.load(args.model)
data = datasets.load_dataset("json", data_files=args.data_path, split=args.split)
predictions, references, model_pred_file_info = get_prediction_results(model, data, args.max_new_tokens, args.delimiter)
int_preds, int_references, label_map = map_predictions_and_references_to_one_hot_encoded_vectors(predictions, references)
int_preds, int_references, label_map = map_predictions_and_references_to_encoded_vectors(predictions, references)
metrics_dict = compute_metrics_dict_multi(int_preds, int_references)
experiment_metadata = {
"model": args.model,
"max_new_tokens": args.max_new_tokens,
"data_path": args.data_path,
}
export_experiment_info(metrics_dict, label_map, model_pred_file_info, experiment_metadata, args.output_dir)



"""
python3 run_evaluation.py --model TinyLlama/TinyLlama-1.1B-step-50K-105b --data_path stanford_alpaca/alpaca_data.json --max_new_tokens 10
{
'input': '',
'instruction': 'Give three tips for staying healthy.',
'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'
}
note: we need scikit learn and evaluate for this script [since f1 is also written on top of sklearn]
"""
