Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Evaluate multiple output component model #906

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 69 additions & 20 deletions bugbug/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,11 @@ def print_labeled_confusion_matrix(confusion_matrix, labels, is_multilabel=False
)


def sort_class_names(class_names):
def sort_class_names(class_names, is_multioutput):
if len(class_names) == 2:
class_names = sorted(list(class_names), reverse=True)
elif is_multioutput:
class_names = [sorted(item) for item in class_names]
else:
class_names = sorted(list(class_names))

Expand All @@ -135,6 +137,8 @@ def __init__(self, lemmatization=False):

self.calculate_importance = True

self.is_multioutput_model = False

@property
def le(self):
"""Classifier agnostic getter for the label encoder property"""
Expand Down Expand Up @@ -287,7 +291,7 @@ def print_feature_importances(

def train(self, importance_cutoff=0.15):
classes, self.class_names = self.get_labels()
self.class_names = sort_class_names(self.class_names)
self.class_names = sort_class_names(self.class_names, self.is_multioutput_model)

# Get items and labels, filtering out those for which we have no labels.
X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))
Expand All @@ -300,7 +304,7 @@ def train(self, importance_cutoff=0.15):

print(f"X: {X.shape}, y: {y.shape}")

is_multilabel = isinstance(y[0], np.ndarray)
is_multilabel = isinstance(y[0], np.ndarray) and not self.is_multioutput_model

# Split dataset in training and test.
X_train, X_test, y_train, y_test = train_test_split(
Expand Down Expand Up @@ -374,30 +378,75 @@ def train(self, importance_cutoff=0.15):
y_pred[0], np.ndarray
), "The predictions should be multilabel"

print(f"No confidence threshold - {len(y_test)} classified")
if is_multilabel:
confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
if self.is_multioutput_model:
tracking_metrics["report"] = {}
tracking_metrics["confusion_matrix"] = {}
class_names = self.class_names
test_labels = y_test.T
for num, y_pred in enumerate(y_pred.T):
y_test = test_labels[num]
self.class_names = class_names[num]
output = ["product", "component", "Conflated component"]

print(
f"No confidence threshold - {len(y_test)} classified for {output[num]} model"
)
if is_multilabel:
confusion_matrix = metrics.multilabel_confusion_matrix(
y_test, y_pred
)
else:
confusion_matrix = metrics.confusion_matrix(
y_test, y_pred, labels=self.class_names
)

print(
classification_report_imbalanced(
y_test, y_pred, labels=self.class_names
)
)
report = classification_report_imbalanced_values(
y_test, y_pred, labels=self.class_names
)

tracking_metrics["report"][output[num]] = report

print_labeled_confusion_matrix(
confusion_matrix, self.class_names, is_multilabel=is_multilabel
)

tracking_metrics["confusion_matrix"][
output[num]
] = confusion_matrix.tolist()

joblib.dump(self, self.__class__.__name__.lower())

return tracking_metrics
else:
confusion_matrix = metrics.confusion_matrix(
y_test, y_pred, labels=self.class_names
)
print(f"No confidence threshold - {len(y_test)} classified")
if is_multilabel:
confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
else:
confusion_matrix = metrics.confusion_matrix(
y_test, y_pred, labels=self.class_names
)

print(
classification_report_imbalanced(
print(
classification_report_imbalanced(
y_test, y_pred, labels=self.class_names
)
)
report = classification_report_imbalanced_values(
y_test, y_pred, labels=self.class_names
)
)
report = classification_report_imbalanced_values(
y_test, y_pred, labels=self.class_names
)

tracking_metrics["report"] = report
tracking_metrics["report"] = report

print_labeled_confusion_matrix(
confusion_matrix, self.class_names, is_multilabel=is_multilabel
)
print_labeled_confusion_matrix(
confusion_matrix, self.class_names, is_multilabel=is_multilabel
)

tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()
tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

# Evaluate results on the test set for some confidence thresholds.
for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
Expand Down
21 changes: 13 additions & 8 deletions bugbug/models/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

from collections import Counter

import numpy as np
import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

from bugbug import bug_features, bugzilla, feature_cleanup
Expand Down Expand Up @@ -59,6 +61,7 @@ def __init__(self, lemmatization=False):

self.cross_validation_enabled = False
self.calculate_importance = False
self.is_multioutput_model = True

feature_extractors = [
bug_features.has_str(),
Expand Down Expand Up @@ -105,8 +108,7 @@ def __init__(self, lemmatization=False):
]
)

self.clf = xgboost.XGBClassifier(n_jobs=16)
self.clf.set_params(predictor="cpu_predictor")
self.clf = MultiOutputClassifier(xgboost.XGBClassifier(n_jobs=16))

self.CONFLATED_COMPONENTS_INVERSE_MAPPING = {
v: k for k, v in self.CONFLATED_COMPONENTS_MAPPING.items()
Expand Down Expand Up @@ -145,12 +147,12 @@ def get_labels(self):

classes = {}
for bug_id, (product, component) in product_components.items():
component = self.filter_component(product, component)
filtered_component = self.filter_component(product, component)

if component:
classes[bug_id] = component
if filtered_component:
classes[bug_id] = [product, component, filtered_component]

component_counts = Counter(classes.values()).most_common()
component_counts = Counter([comp[2] for comp in classes.values()]).most_common()
top_components = set(component for component, count in component_counts)

print(f"{len(top_components)} components")
Expand Down Expand Up @@ -181,10 +183,13 @@ def get_labels(self):
classes = {
bug_id: component
for bug_id, component in classes.items()
if component in top_components
if component[2] in top_components
}

return classes, set(classes.values())
class_names = [
set(item) for item in np.array([item for item in classes.values()]).T
]
return classes, class_names

def is_meaningful(self, product, component):
return product in self.PRODUCTS and component not in ["General", "Untriaged"]
Expand Down