diff --git a/scripts/run_evaluation.py b/scripts/run_evaluation.py
index 7d9d5fbdc..faa8df35a 100644
--- a/scripts/run_evaluation.py
+++ b/scripts/run_evaluation.py
@@ -3,13 +3,13 @@
 Metrics used: Accuracy / Micro & Macro F1.
 """
 import os
+import numpy as np
 from sklearn import preprocessing
 from sklearn.metrics import f1_score, accuracy_score
 import argparse
 import json
 from tqdm import tqdm
 import datasets
-import evaluate
 from run_inference import TunedCausalLM
 from shutil import rmtree
 
@@ -141,109 +141,73 @@ def postprocess_output(output_text, delimiter):
     if delimiter is not None:
         return [text_substr.strip() for text_substr in output_text.split(delimiter)]
     return [output_text.strip()]
-### Metric computation/display & utils for mapping labels to numerics for hf evaluate
-def map_predictions_and_references_to_numerics(predictions: list, references: list) -> tuple:
-    """Maps string predictions and references to numerics for use in accuracy and
-    f1 computations. This process is generally ambiguous and can be done a number of
-    ways, but the strategy we use is as follows:
-    - Prior to consideration, all predictions and references are stripped of whitespace
-    - Map all unique reference values to integers
-    - Apply mapping of ref -> int to predictions; anything else is mapped to an unknown label val
-      where the unknown label is treated as its own class
-    Important caveats:
-    - this strategy is case sensitive
-    - this cannot be used for multioutput classification problems as is, since the entire
-      predicted text is treated as a single label
-    - be careful about the value of the max number of tokens for generation, since this
-      essentially boils down to a string match problem
-
-    Args:
-        predictions: list
-            List of strings to be converted to class indices predicted by model.
-        references[list]
-            List of strings to be converted to class indices for ground truth.
-
-    Returns:
-        tuple
-            Tuple containing:
-                int_predictions [list of ints] class indices for predicted samples
-                ref_predictions [list of ints] class indices for ground truth samples
-                label_map [dict] dict mapping indices to strings
-    """
-    le = preprocessing.LabelEncoder()
-    le.fit(predictions)
-    # Label encoder maps from class indices from [0, n-1], so we use n as our throwaway class
-    unk_label = le.classes_.shape[0]
-    int_predictions = [get_encoded_label(le, pred, unk_label) for pred in predictions]
-    int_references = [get_encoded_label(le, references, unk_label) for pred in predictions]
-    # Generate the class mapping + the unk label
+### Metric computation/display & utils for mapping labels to numerics for sklearn
+def map_predictions_and_references_to_encoded_vectors(predictions: list, references: list):
+    """Map the delimited prediction / reference label lists onto binary label vectors for sklearn."""
+    ohe = preprocessing.OneHotEncoder()
+    # Extract the unique (potentially delimited) labels to fit the one hot encoder. We need to do
+    # this directly in case it's a multiclass/multilabel scenario, because the 2D arr consumed
+    # by the OHE expects consistent axis shapes, i.e., columns are treated as different features,
+    # and cannot have a variable number of values.
+    unk_label = ""
+    unique_labels = extract_unique_labels(predictions, references, unk_label)
+    ohe.fit(unique_labels)
+
+    # Now get the encoded vectors for our references and our predictions by one hot encoding
+    # the unique sublabels and collapsing them into one vector along the row dimension.
+    reference_vectors = [get_encoded_vector(ohe, refs, unk_label) for refs in references]
+    pred_vectors = [get_encoded_vector(ohe, preds, unk_label) for preds in predictions]
+
+    # For debugging purposes - map the indices in our one hot encoded entries back to labels.
+    # NOTE: the categories_ attr is a 2D array of features, and we only care about [0]
+    # since the uniquely extracted labels are only single dim features when fitting
+    # the transform itself.
     label_map = {
-        idx: label for idx, label in enumerate(le.inverse_transform(list(range(unk_label))))
+        idx: label for idx, label in enumerate(ohe.categories_[0])
     }
-    label_map[unk_label] = ""
-    return int_predictions, int_references, label_map
-
-def get_encoded_label(le: preprocessing.LabelEncoder, gen_text: str, unk_label: int) -> int:
-    """Gets the encoded label of a text string.
-    Args:
-        le: preprocessing.LabelEncode
-            Label Encoder object which maps text strings into class indices.
-        gen_text: str
-            Text that was generated as a label by the model.
-        unk_label: int
-            Label to be used for unknown / garbage generation, i.e., things unknown to the
-            label encoder.
-
-    Returns:
-        int
-            The integer label index corresponding to the generated text.
-    """
-    try:
-        return le.transform(gen_text)[0]
-    except ValueError:
-        # Model generated text that is not a valid label, i.e., is not in the label encoder
-        return unk_label
-
-
-def compute_metrics_dict(int_preds: list, int_references: list) -> dict:
-    """Calculate the metrics on the (int) lists of preds against ground truth.
-
-    Args:
-        int_preds: list
-            list of class indices for texts generated by the model.
-        int_references: list
-            list of class indices for ground truth labels.
-
-    Returns:
-        dict
-            Dictionary containing F1 / accuracy metrics.
-    """
-    f1_metric = evaluate.load("f1")
-    accuracy_metric = evaluate.load("accuracy")
-    # Compute the micro & macro f1 scores
-    micro_f1 = f1_metric.compute(predictions=int_preds, references=int_references, average="micro")
-    macro_f1 = f1_metric.compute(predictions=int_preds, references=int_references, average="macro")
-    # Compute the accuracy
-    accuracy = accuracy_metric.compute(predictions=int_preds, references=int_references)
-    return {
-        "f1": {
-            "micro": micro_f1,
-            "macro": macro_f1,
-        },
-        "accuracy": accuracy
-    }
-
-
-#### Replaces legacy logic
-def map_predictions_and_references_to_one_hot_encoded_vectors(predictions: list, references: list):
-    # Currently it's a stub that we can use to validate our metric correctness
-    references = [[1,1,0], [1,0,1], [0,0,1], [0,0,0]]
-    predictions = [[0,1,1], [1,0,0], [0,0,1], [1,0,1]]
-    label_map = {0: "bird", 2: "cat", 3: "dog"}
-    # In this scenario, dog basically represents
-    return references, predictions, label_map
+    return pred_vectors, reference_vectors, label_map
+
+def get_encoded_vector(ohe, texts, unk_label) -> np.ndarray:
+    """Collapse the one hot encodings of a (delimited) label list into a single binary vector."""
+    # Since our encoded vector is built on collapsing one hot encoded vectors,
+    # we need to explicitly handle the empty case since it is not one hot encodable.
+    if not texts:
+        return np.zeros(len(ohe.categories_[0]))
+    # Clean the generated text list; anything in the list that is not known to the
+    # one hot encoder gets replaced by the unk_label. It is okay if we have multiple unk_labels
+    # in the vector, since all of these just map to one positive entry in the encoded vector.
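+    # Illustrative (hypothetical) example: with fitted categories ["", "cat", "dog"], generated
+    # sublabels ["cat", "not-a-known-label"] clean to {"cat", ""}, one hot encode to the rows
+    # [[0, 1, 0], [1, 0, 0]], and collapse to the binary vector [1, 1, 0].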
+    cleaned_texts = list(set([text if text in ohe.categories_[0] else unk_label for text in texts]))
+    # Encode the cleaned text as a 2D feature array of one hot encoded vectors
+    vec_stack = ohe.transform([[text] for text in cleaned_texts]).toarray()
+
+    # Then collapse the stack of one hot encoded vectors along the row axis (axis 0) to get
+    # the encoded binary vector for the multilabel / multiclass prediction.
+    return vec_stack.sum(axis=0)
+
+def extract_unique_labels(predictions, references, unk_label):
+    """Grab all of the unique labels and return them as a list of single feature lists."""
+    unique_ref_labels = set()
+    for ref in references:
+        for sub_label in ref:
+            # This is pretty unlikely to happen (a class literally named ""), but for now, raise
+            # if we see it, since it would currently skew the results.
+            if sub_label == unk_label:
+                raise ValueError(f"Unk label {unk_label} is being used as a ground truth label!")
+            unique_ref_labels.add(sub_label)
+
+    ref_label_list = [[label] for label in unique_ref_labels]
+    # HACK - traverse the predictions and see if any unk predictions were made; if so, make a
+    # garbage class, which we will mark as false positives here.
+    for pred in predictions:
+        for sub_pred in pred:
+            # One of our delimited predictions is unknown!
+            if sub_pred not in unique_ref_labels:
+                # Add the unk label once we know that it isn't a field in our eval data
+                print("Adding label to handle garbage label generation")
+                ref_label_list.append([unk_label])
+                return ref_label_list
+    return ref_label_list
 
 def compute_metrics_dict_multi(enc_preds, enc_refs):
     micro_f1 = f1_score(enc_refs, enc_preds, average="micro")
@@ -293,7 +257,7 @@ def export_experiment_info(metrics_dict: dict, label_map: dict, model_pred_file_
     model = TunedCausalLM.load(args.model)
     data = datasets.load_dataset("json", data_files=args.data_path, split=args.split)
     predictions, references, model_pred_file_info = get_prediction_results(model, data, args.max_new_tokens, args.delimiter)
-    int_preds, int_references, label_map = map_predictions_and_references_to_one_hot_encoded_vectors(predictions, references)
+    int_preds, int_references, label_map = map_predictions_and_references_to_encoded_vectors(predictions, references)
    metrics_dict = compute_metrics_dict_multi(int_preds, int_references)
     experiment_metadata = {
         "model": args.model,
@@ -301,17 +265,3 @@ def export_experiment_info(metrics_dict: dict, label_map: dict, model_pred_file_
         "data_path": args.data_path,
     }
     export_experiment_info(metrics_dict, label_map, model_pred_file_info, experiment_metadata, args.output_dir)
-
-
-
-"""
-python3 run_evaluation.py --model TinyLlama/TinyLlama-1.1B-step-50K-105b --data_path stanford_alpaca/alpaca_data.json --max_new_tokens 10
-
-{
-    'input': '',
-    'instruction': 'Give three tips for staying healthy.',
-    'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'
-}
-
-note: we need scikit learn and evaluate for this script [since f1 is also written on top of sklearn]
-"""
\ No newline at end of file
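
A minimal sanity-check sketch (not part of the patch) of the new sklearn-based metric path on hypothetical toy labels. The inline encode() helper simply mirrors what the patched get_encoded_vector does, and the metric calls match compute_metrics_dict_multi.

import numpy as np
from sklearn import preprocessing
from sklearn.metrics import f1_score, accuracy_score

references = [["cat"], ["dog"], ["bird"]]
predictions = [["cat"], ["dog", "bird"], ["some garbage generation"]]

unk_label = ""  # stand-in class for generations that match no reference label
unique_labels = [[sub] for sub in {sub for ref in references for sub in ref}] + [[unk_label]]
ohe = preprocessing.OneHotEncoder()
ohe.fit(unique_labels)

def encode(texts):
    # Map unknown sublabels to unk_label, one hot encode, and collapse to one binary vector
    cleaned = list({text if text in ohe.categories_[0] else unk_label for text in texts})
    return ohe.transform([[text] for text in cleaned]).toarray().sum(axis=0)

enc_refs = np.array([encode(ref) for ref in references])
enc_preds = np.array([encode(pred) for pred in predictions])

# (macro f1 will warn that the unk column has no true samples; that is expected here)
print("micro f1:", f1_score(enc_refs, enc_preds, average="micro"))
print("macro f1:", f1_score(enc_refs, enc_preds, average="macro"))
print("accuracy:", accuracy_score(enc_refs, enc_preds))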