Skip to content

Commit

Permalink
support json columns like chat_history (#1628)
Browse files Browse the repository at this point in the history
* support json columns like chat_history

* still drop chat_history due to mltable limit

* update preprocessor version

* update test

* remove trailing whitespace

* add doc string

---------

Co-authored-by: Richard Li <[email protected]>
  • Loading branch information
RichardLi1437 and Richard Li authored Nov 3, 2023
1 parent d6fce9f commit 0a9c24c
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ type: spark
name: model_data_collector_preprocessor
display_name: Model Data Collector - Preprocessor
description: Filters the data based on the window provided.
version: 0.3.5
version: 0.3.6
is_deterministic: true

code: ../../src
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from fsspec import AbstractFileSystem
from azureml.fsspec import AzureMachineLearningFileSystem
from datetime import datetime
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import lit, col
from shared_utilities.event_utils import post_warning_event
from shared_utilities.io_utils import (
init_spark,
Expand All @@ -27,7 +27,8 @@
MDC_CHAT_HISTORY_COLUMN,
MDC_CORRELATION_ID_COLUMN,
MDC_DATA_COLUMN,
MDC_DATAREF_COLUMN
MDC_DATAREF_COLUMN,
SCHEMA_INFER_ROW_COUNT
)

from typing import Tuple
Expand Down Expand Up @@ -158,15 +159,15 @@ def _get_data_columns(df: DataFrame) -> list:
return columns


def _extract_data_and_correlation_id(df: DataFrame, extract_correlation_id: bool, datastore: str) -> DataFrame:
def _extract_data_and_correlation_id(df: DataFrame, extract_correlation_id: bool, datastore: str = None) -> DataFrame:
"""
Extract data and correlation id from the MDC logs.
If data column exists, return the json contents in it,
otherwise, return the dataref content which is a url to the json file.
"""

def read_data(row):
def read_data(row) -> str:
data = getattr(row, MDC_DATA_COLUMN, None)
if data:
return data
Expand All @@ -178,19 +179,33 @@ def read_data(row):
return data_url
# TODO: Move this to tracking stream if both data and dataref are NULL

# Output MLTable
def row_to_pdf(row) -> pd.DataFrame:
return pd.read_json(read_data(row))

data_columns = _get_data_columns(df)
first_data_row = df.select(data_columns).rdd.map(lambda x: x).first()
data_rows = df.select(data_columns).rdd.take(SCHEMA_INFER_ROW_COUNT) # TODO: make it an argument user can define

spark = init_spark()
data_as_df = spark.createDataFrame(pd.read_json(read_data(first_data_row)))
infer_pdf = pd.concat([row_to_pdf(row) for row in data_rows], ignore_index=True)
data_as_df = spark.createDataFrame(infer_pdf)
# data_as_df.show()
# data_as_df.printSchema()

# The temporary workaround to remove the chat_history column if it exists.
# We are removing the column because the pyspark DF is unable to parse it.
# This version of the MDC is applied only to GSQ.
if MDC_CHAT_HISTORY_COLUMN in data_as_df.columns:
data_as_df = data_as_df.drop(col(MDC_CHAT_HISTORY_COLUMN))

def extract_data_and_correlation_id(entry, correlationid):
result = pd.read_json(entry)
result[MDC_CORRELATION_ID_COLUMN] = ""
for index, row in result.iterrows():
result.loc[index, MDC_CORRELATION_ID_COLUMN] = (
correlationid + "_" + str(index)
)
return result

def tranform_df_function_with_correlation_id(iterator):
for df in iterator:
yield pd.concat(
Expand All @@ -201,15 +216,6 @@ def tranform_df_function_with_correlation_id(iterator):
for row in df.itertuples()
)

def extract_data_and_correlation_id(entry, correlationid):
result = pd.read_json(entry)
result[MDC_CORRELATION_ID_COLUMN] = ""
for index, row in result.iterrows():
result.loc[index, MDC_CORRELATION_ID_COLUMN] = (
correlationid + "_" + str(index)
)
return result

def transform_df_function_without_correlation_id(iterator):
for df in iterator:
yield pd.concat(
Expand Down Expand Up @@ -247,6 +253,7 @@ def _raw_mdc_uri_folder_to_preprocessed_spark_df(
df = _convert_mltable_to_spark_df(table, preprocessed_input_data, fs)
# print("df after converting mltable to spark df:")
# df.show()
# df.printSchema()

if not df:
print("Skipping the Model Data Collector preprocessor.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,4 @@
MDC_CORRELATION_ID_COLUMN = 'correlationid'
MDC_DATA_COLUMN = 'data'
MDC_DATAREF_COLUMN = 'dataref'
SCHEMA_INFER_ROW_COUNT = 10
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"specversion":"1.0","id":"82c8f4c4-8c40-4c14-8a98-55a957004924","source":"/subscriptions/15ae9cb6-95c1-483d-a0e3-b1a1a3b06324/resourceGroups/copilot-demo-eastus/providers/Microsoft.MachineLearningServices/workspaces/aistudio_townhall/onlineEndpoints/aistudio-townhall-kuxtz/deployments/aistudio-townhall-kuxtz-1","type":"azureml.inference.model_inputs","datacontenttype":"application/json","time":"2023-10-24T22:20:24Z","data":[{"question":"price of trail shoes ?","chat_history":[]}],"contentrange":"bytes 0-56/57","correlationid":"6c4861f8-fa33-42ff-bc93-cf8f459ee013","xrequestid":"6c4861f8-fa33-42ff-bc93-cf8f459ee013","modelversion":"default","collectdatatype":"pandas.core.frame.DataFrame","agent":"azureml-ai-monitoring/0.1.0b3"}
{"specversion":"1.0","id":"8b52369e-7fb9-454b-a8f5-6c845137fd5f","source":"/subscriptions/15ae9cb6-95c1-483d-a0e3-b1a1a3b06324/resourceGroups/copilot-demo-eastus/providers/Microsoft.MachineLearningServices/workspaces/aistudio_townhall/onlineEndpoints/aistudio-townhall-kuxtz/deployments/aistudio-townhall-kuxtz-1","type":"azureml.inference.model_inputs","datacontenttype":"application/json","time":"2023-10-24T22:24:55Z","data":[{"question":"What is the proper care for trailwalker hiking shoes?","chat_history":[{"inputs":{"question":"How much does TrailWalker Hiking Shoes cost? "},"outputs":{"output":"The TrailWalker Hiking Shoes cost $110. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_11.md)"}}]}],"modelversion":"default","collectdatatype":"pandas.core.frame.DataFrame","agent":"azureml-ai-monitoring/0.1.0b3","contentrange":"bytes 0-384/385","correlationid":"7415b7a9-47d5-44bd-8d50-8762ff542d2a","xrequestid":"7415b7a9-47d5-44bd-8d50-8762ff542d2a"}
{"specversion":"1.0","id":"c26e3d0f-2f2d-42af-949a-d267e0318824","source":"/subscriptions/15ae9cb6-95c1-483d-a0e3-b1a1a3b06324/resourceGroups/copilot-demo-eastus/providers/Microsoft.MachineLearningServices/workspaces/aistudio_townhall/onlineEndpoints/aistudio-townhall-kuxtz/deployments/aistudio-townhall-kuxtz-1","type":"azureml.inference.model_inputs","datacontenttype":"application/json","time":"2023-10-24T22:26:11Z","data":[{"question":"What is the material for PowerBurner Camping Stove?","chat_history":[{"inputs":{"question":"How much does TrailWalker Hiking Shoes cost? "},"outputs":{"output":"The TrailWalker Hiking Shoes cost $110. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_11.md)"}},{"inputs":{"question":"What is the proper care for trailwalker hiking shoes?"},"outputs":{"output":"According to the product information, proper care and maintenance for the TrailWalker Hiking Shoes include removing any dirt or debris after each use, rinsing them with clean water and gently scrubbing with a soft brush if they are muddy or heavily soiled, allowing them to air dry naturally, periodically applying a waterproofing treatment, inspecting them regularly for any signs of wear and tear, and storing them in a cool, dry place when not in use. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_11.md)"}},{"inputs":{"question":"What brand is for TrailMaster tent? "},"outputs":{"output":"The brand for the TrailMaster X4 Tent is OutdoorLiving. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_1.md)"}},{"inputs":{"question":"How do I carry the TrailMaster tent around?"},"outputs":{"output":"The TrailMaster X4 Tent comes with a carry bag for easy transport. When packed in its carry bag, it can be comfortably carried during hikes. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_1.md)"}},{"inputs":{"question":"What is the floor area for Floor Area?"},"outputs":{"output":"The floor area for the TrailMaster X4 Tent is 80 square feet. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_1.md)"}},{"inputs":{"question":"What is the material for TrailBlaze Hiking Pants"},"outputs":{"output":"The TrailBlaze Hiking Pants are made of high-quality nylon fabric. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_10.md)"}},{"inputs":{"question":"What color does TrailBlaze Hiking Pants come in"},"outputs":{"output":"The TrailBlaze Hiking Pants come in khaki color. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_10.md)"}},{"inputs":{"question":"Cant he warrenty for TrailBlaze pants be transfered?"},"outputs":{"output":"No, the warranty for the TrailBlaze Hiking Pants is non-transferable and applies only to the original purchaser of the product. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_10.md)"}},{"inputs":{"question":"How long are the TrailBlaze pants under warrenty for?"},"outputs":{"output":"The TrailBlaze Hiking Pants are covered by a limited warranty for 1 year from the date of purchase. (Source: azureml:\/\/locations\/eastus\/workspaces\/41f8d98d-f5e7-4bb3-8f74-b1f6a33acb5b\/data\/vector-index-input-1697916790707\/versions\/1\/product_info_10.md)"}}]}],"agent":"azureml-ai-monitoring/0.1.0b3","contentrange":"bytes 0-3511/3512","correlationid":"f313a05b-5f84-449e-a6c6-046ef2f39409","xrequestid":"f313a05b-5f84-449e-a6c6-046ef2f39409","modelversion":"default","collectdatatype":"pandas.core.frame.DataFrame"}
Loading

0 comments on commit 0a9c24c

Please sign in to comment.