update the compute feature importances model monitoring component to use TreeExplainer to optimize execution time (#2163)

imatiach-msft authored Jan 23, 2024
1 parent 233acea commit 33e3606
Showing 5 changed files with 27 additions and 54 deletions.

File 1 of 5: data_drift_signal_monitor pipeline spec

@@ -4,7 +4,7 @@ type: pipeline
 name: data_drift_signal_monitor
 display_name: Data Drift - Signal Monitor
 description: Computes the data drift between a baseline and production data assets.
-version: 0.3.26
+version: 0.3.27
 is_deterministic: true
 
 inputs:
@@ -63,7 +63,7 @@ outputs:
 jobs:
   compute_feature_importances:
     type: spark
-    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.9
+    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.10
     inputs:
       baseline_data:
         type: mltable

File 2 of 5: data_quality_signal_monitor pipeline spec

@@ -4,7 +4,7 @@ type: pipeline
 name: data_quality_signal_monitor
 display_name: Data Quality - Signal Monitor
 description: Computes the data quality of a target dataset with reference to a baseline.
-version: 0.3.24
+version: 0.3.25
 is_deterministic: true
 
 inputs:
@@ -63,7 +63,7 @@ outputs:
 jobs:
   compute_feature_importances:
     type: spark
-    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.9
+    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.10
     inputs:
       baseline_data:
         type: mltable

File 3 of 5: feature_attribution_drift_signal_monitor pipeline spec

@@ -4,7 +4,7 @@ type: pipeline
 name: feature_attribution_drift_signal_monitor
 display_name: Feature Attribution Drift - Signal Monitor
 description: Computes the feature attribution between a baseline and production data assets.
-version: 0.3.17
+version: 0.3.18
 is_deterministic: true
 
 inputs:
@@ -44,7 +44,7 @@ outputs:
 jobs:
   compute_baseline_explanations:
     type: spark
-    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.9
+    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.10
     inputs:
       baseline_data:
         type: mltable
@@ -63,7 +63,7 @@ jobs:
       type: aml_token
   compute_production_explanations:
     type: spark
-    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.9
+    component: azureml://registries/azureml/components/feature_importance_metrics/versions/0.3.10
     inputs:
       baseline_data:
         type: mltable

File 4 of 5: feature_importance_metrics Spark component spec

@@ -2,7 +2,7 @@ $schema: http://azureml/sdk-2-0/SparkComponent.json
 type: spark
 
 name: feature_importance_metrics
-version: 0.3.9
+version: 0.3.10
 display_name: Feature importance
 is_deterministic: true
 description: Feature importance for model monitoring.

File 5 of 5: feature importance computation script (Python)

@@ -16,10 +16,7 @@
 from shared_utilities.momo_exceptions import InvalidInputError
 from shared_utilities import constants
 from sklearn.model_selection import train_test_split
-from responsibleai import RAIInsights, FeatureMetadata
-from ml_wrappers.model.predictions_wrapper import (
-    PredictionsModelWrapperClassification,
-    PredictionsModelWrapperRegression)
+from interpret_community.shap.tree_explainer import TreeExplainer
 
 try:
     from lightgbm import LGBMClassifier, LGBMRegressor
@@ -107,8 +104,8 @@ def create_lightgbm_model(X, y, task_type):
     return model
 
 
-def get_model_wrapper(task_type, target_column, train_data):
-    """Create model wrapper using ml-wrappers on which to calculate feature importances.
+def get_model(task_type, target_column, baseline_data):
+    """Create a lightgbm model on which to calculate feature importances.
 
     :param task_type: The task type (regression or classification) of the resulting model
     :type task_type: string
@@ -117,26 +114,12 @@ def get_model_wrapper(task_type, target_column, train_data):
     :param baseline_data: The baseline data meaning the data used to create the
         model monitor
     :type baseline_data: pandas.DataFrame
-    :return: an appropriate model wrapper
-    :rtype: PredictionsModelWrapperRegression or PredictionsModelWrapperClassification
+    :return: The trained lightgbm surrogate model
+    :rtype: LGBMClassifier or LGBMRegressor
     """
-    y_train = train_data[target_column]
-    x_train = train_data.drop([target_column], axis=1)
-    model = create_lightgbm_model(x_train, y_train, task_type)
-    model_predict = model.predict(x_train)
-    log_time_and_message("Called predict on model")
-
-    if task_type == constants.CLASSIFICATION:
-        model_predict_proba = model.predict_proba(x_train)
-        model_wrapper = PredictionsModelWrapperClassification(
-            x_train,
-            model_predict,
-            model_predict_proba)
-    else:
-        model_wrapper = PredictionsModelWrapperRegression(x_train, model_predict)
-
-    log_time_and_message("Created ml wrapper")
-    return model_wrapper
+    y_train = baseline_data[target_column]
+    x_train = baseline_data.drop([target_column], axis=1)
+    return create_lightgbm_model(x_train, y_train, task_type)
 
 
 def get_train_test_data(data):
@@ -159,11 +142,11 @@ def get_train_test_data(data):
     return train_data, test_data
 
 
-def compute_explanations(model_wrapper, train_data, test_data, categorical_features, target_column, task_type):
+def compute_explanations(model, train_data, test_data, categorical_features, target_column, task_type):
     """Compute explanations (feature importances) for a given dataset.
 
-    :param model_wrapper: wrapper around a model that can be used to calculate explanations
-    :type model_wrapper: PredictionsModelWrapperRegression or PredictionsModelWrapperClassification
+    :param model: surrogate model that can be used to calculate explanations
+    :type model: LGBMClassifier or LGBMRegressor
     :param data: The data used to calculate the explanations
     :type data: pandas.Dataframe
     :param categorical_features: categorical features not including the target column
@@ -175,19 +158,11 @@ def compute_explanations(model_wrapper, train_data, test_data, categorical_features, target_column, task_type):
     :return: explanation scores for the input data
     :rtype: list[float]
     """
-    # Create the RAI Insights object, split baseline data into train and test data
-    feature_metadata = FeatureMetadata(categorical_features=categorical_features, dropped_features=[])
-
-    rai_i: RAIInsights = RAIInsights(
-        model_wrapper, train_data, test_data, target_column, task_type, feature_metadata=feature_metadata
-    )
-    log_time_and_message("Created RAIInsights")
-    # Add the global explanations using batching to allow for larger input data sizes
-    rai_i.explainer.add()
-    evaluation_data = train_data.drop([target_column], axis=1)
+    evaluation_data = test_data.drop([target_column], axis=1)
+    explainer = TreeExplainer(model)
     log_time_and_message("Requesting explanations")
-    explanationData = rai_i.explainer.request_explanations(local=False, data=evaluation_data)
-    return explanationData.precomputedExplanations.globalFeatureImportance['scores']
+    global_explanation = explainer.explain_global(evaluation_data, include_local=False)
+    return global_explanation.global_importance_values
 
 
 def compute_feature_importance(task_type, target_column, lgbm_df, categorical_features):
@@ -204,12 +179,11 @@ def compute_feature_importance(task_type, target_column, lgbm_df, categorical_features):
     :return: list of feature importances in the order of the columns in the baseline data
     :rtype: list[float]
     """
-    model_wrapper = get_model_wrapper(task_type, target_column, lgbm_df)
+    model = get_model(task_type, target_column, lgbm_df)
    train_data, test_data = get_train_test_data(lgbm_df)
     baseline_explanations = compute_explanations(
-        model_wrapper, train_data, test_data, categorical_features, target_column, task_type)
+        model, train_data, test_data, categorical_features, target_column, task_type)
     log_time_and_message("Successfully computed explanations for dataset")
-
     return baseline_explanations
 
 
@@ -295,9 +269,8 @@ def run(args):
     check_df_has_target_column_with_error(baseline_df, args.target_column)
 
     numerical_features, categorical_features = get_numerical_and_categorical_cols(
-        baseline_df,
-        args.override_numerical_features,
-        args.override_categorical_features)
+        baseline_df, args.override_numerical_features,
+        args.override_categorical_features)
     baseline_df = baseline_df.toPandas()
 
     task_type = determine_task_type(args.task_type, args.target_column, baseline_df, categorical_features)
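
For context, here is a minimal, self-contained sketch of the new code path this commit introduces: fit a LightGBM surrogate on the baseline data, then compute global feature importances with interpret-community's TreeExplainer. The DataFrame, column names, split ratio, and the direct LGBMClassifier call are illustrative stand-ins (the real component builds its model via create_lightgbm_model() and reads Spark inputs); the explainer calls themselves match the diff above.

# Sketch of the new TreeExplainer flow; data and column names are hypothetical.
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from interpret_community.shap.tree_explainer import TreeExplainer

# Hypothetical baseline data with a binary target column.
rng = np.random.default_rng(0)
baseline_df = pd.DataFrame({
    "feature_a": rng.normal(size=200),
    "feature_b": rng.integers(0, 3, size=200).astype(float),
})
baseline_df["target"] = (baseline_df["feature_a"] > 0).astype(int)

# get_model(): fit a LightGBM surrogate on the full baseline data.
x_train = baseline_df.drop(["target"], axis=1)
y_train = baseline_df["target"]
model = LGBMClassifier().fit(x_train, y_train)

# get_train_test_data(): split the baseline data; the test half feeds the explainer.
train_data, test_data = train_test_split(baseline_df, test_size=0.5, random_state=0)

# compute_explanations(): TreeExplainer reads the tree ensemble directly,
# so no prediction wrapper or RAIInsights object is required.
evaluation_data = test_data.drop(["target"], axis=1)
explainer = TreeExplainer(model)
global_explanation = explainer.explain_global(evaluation_data, include_local=False)

# One global importance score per feature, in column order.
print(dict(zip(evaluation_data.columns, global_explanation.global_importance_values)))

The execution-time win comes from Tree SHAP: interpret-community's TreeExplainer wraps shap's tree explainer, which computes SHAP values in polynomial time from the tree structure itself, rather than driving the model through prediction wrappers and the RAIInsights explanation pipeline.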
