From 451385c13c71f984a7395a9804595f64c1175751 Mon Sep 17 00:00:00 2001
From: yolaj-nhs
Date: Tue, 15 Aug 2023 16:17:02 +0100
Subject: [PATCH] Updating user stories

---
 user_stories/README.md                        | 29 ++++++++++---------
 .../user_story_1/user_story_1_researcher.py   |  5 ++--
 .../user_story_2/user_story_2_researcher.py   | 11 ++++---
 user_stories/user_story_2/user_story_2_tre.py | 23 ++++++++-------
 .../user_story_3/user_story_3_researcher.py   |  2 +-
 user_stories/user_story_3/user_story_3_tre.py |  2 +-
 .../user_story_7/user_story_7_researcher.py   |  4 +--
 7 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/user_stories/README.md b/user_stories/README.md
index 88fbfa21..9c4bd0ec 100644
--- a/user_stories/README.md
+++ b/user_stories/README.md
@@ -1,36 +1,37 @@
 ## User story 1: Ideal Case
-- User creates an object “mydata” of type aisdc.attacks.dataset.Data and provides a separate code file that does the translation between the data in the format provided and the data in the format to be input to the machine any model.
-- User creates a model “mymodel” from the safeXClassifier class and calls mymodel.fit().
-- User calls mymodel.preliminary_check() to make sure their hyper-parameters are within the TRE risk appetite for algorithm X.
-- User calls mymodel.run_attack(mydata) for different attack types and iterates over different hyper-parameters until they have an accurate model, and they interpret attack results as safe.
-- User calls myModel.request_release() with parameters modelsavefile.sav and again passing the mydata object (without it request_release does not run attacks).
+- User creates an object "target" of type aisdc.attacks.target.Target and provides a separate code file that does the translation between the data in the format provided and the data in the format to be input to the machine learning model.
+- User creates a model "model" from the safeXClassifier class and calls model.fit().
+- User calls model.preliminary_check() to make sure their hyper-parameters are within the TRE risk appetite for algorithm X.
+- User calls model.run_attack(target) for different attack types and iterates over different hyper-parameters until they have an accurate model and interpret the attack results as safe.
+- User calls model.request_release(), passing the filename modelsavefile.sav and again passing the target object (without it, request_release does not run attacks).
 - LIRA, worst_case, and attribute_inference attacks are run automatically,
 - results are stored ready for the TRE output checkers to look at.
-  - System also saves the results of mymodel.posthoc_check() for poor practice, model edits etc.
+  - System also saves the results of model.posthoc_check(), which checks for poor practice, model edits, etc.
 - TRE checker has everything they need to make a decision with no further processing.

 ## User story 2: Next Case
-- User provides Data object and code, uses safeXClassifier() but does not pass data object to request_release() or save processed form of data.
+- User provides Target object and code, uses safeXClassifier(), but does not pass the target object to request_release() or save the processed form of the data.
 - safeXClassifer report checks for class disclosure and TRE risk appetite for algorithm X.
 - TRE output checker has to manually recreate processed data using code provided.
 - TRE output checker is unable to run any attacks UNLESS they also know exactly which rows from the dataset were used for training and testing.
-- So dataset object needs to store those specific details OR use fixed values for seed (e.g. to sklearn.train_test_split() ) and be extremely transparent about how stratification was done).
+- So dataset object needs to store those specific details OR use fixed values for seed (e.g. to sklearn.train_test_split()) and be extremely transparent about how stratification was done.
 - If TRE has enough info to recreate train/test processed data, then they can
-  - run attacks from script.
+  - Run attacks from script.
   - Then the post-processing script
   - Then make a judgement.

 ## User Story 3: User provides dataset object but does not use safeXClassifier
 - In this case we don’t currently have any checking for TRE-approved hyper-parameters or for class disclosure.
 - But if it is a type where we have a safemodel version, we could create functionality to load it and then check hyper-parameters using existing code
-  - This raises the issue of whether safeModelClassifiers should have a load() option ?? – I;s currently commented out
+  - This raises the issue of whether safeModelClassifiers should have a load() option? – it is currently commented out
 - Could also provide method for checking for k-anonymity (and possible pure nodes) where appropriate by refactoring safemodels.
 - TREs need to manually configure and start scripts to do LIRA, Worst_Case and Attribute_Inference attacks
 - NB this assumes their classifier outputs probabilities.

-## User Story 4 (not implemented yet): User does not use safeXClassifier, or provide dataset object
+## User Story 4: User does not use safeXClassifier, or provide a dataset object
 ### but does provide description of pre-processing,
 ### and provides output probabilities for the train and test set they have used (and true classes?)
+#### Status: in progress, still to create the TRE script
 - We cannot assume that the TRE has the capability to get the right bits of pre-processing code from their source code.
 - Do we insist on this (would be needed for ‘outside world’)? what if this is commercially sensitive?
 - TRE can in theory run LIRA and worst-case but not attribute inference attacks.
@@ -43,7 +44,8 @@

 **THIS would be the version that let people use R **

-## User Story 5 (not implemented yet): User creates differentially private algorithm (not via our code) and provides sufficient details to create data object.
+## User Story 5: User creates a differentially private algorithm (not via our code) and provides sufficient details to create a data object.
+#### Status: not implemented yet
 - How do we know what the actual epsilon value is?
 - If it is a keras model we can reload and query it if they have stored the training object as part of the model save (we need epochs, dataset size, L2 norm clip, noise values).
 - But then their stored model probably has disclosive values in anyway …
@@ -56,7 +58,8 @@
 - Does the actual epsilon value matter if we are doing that?
 - Yes probably, because it is the sort of thing a TRE may well set as a policy.

-## User Story 6 (not implemented yet): Worst Case
+## User Story 6: Worst Case
+#### Status: not implemented yet
 - User makes R model for a tree-based classifier that we have not experimented with.
 - TREs get researcher to provide at minimum the processed train and test files.
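For orientation, the sketch below illustrates the ideal-case flow from user story 1, reusing the `Target(model=...)` and `add_processed_data()` calls that appear in the TRE scripts in this patch. It is illustrative only: `SafeDecisionTreeClassifier` stands in for the generic safeXClassifier, synthetic data replaces the nursery dataset, and the attack-name string and `request_release()` argument order are assumptions that may differ between aisdc versions.

```python
# Illustrative sketch of the user story 1 workflow; not part of this patch.
# SafeDecisionTreeClassifier stands in for "safeXClassifier"; exact method
# signatures (attack names, request_release arguments) may differ by version.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from aisdc.attacks.target import Target
from aisdc.safemodel.classifiers import SafeDecisionTreeClassifier

# Placeholder data standing in for the processed nursery dataset.
X, y = make_classification(n_samples=500, n_features=8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

model = SafeDecisionTreeClassifier()
model.fit(X_train, y_train)
model.preliminary_check()  # hyper-parameters vs the TRE risk appetite

target = Target(model=model)  # same pattern as user_story_3_tre.py below
target.add_processed_data(X_train, y_train, X_test, y_test)

model.run_attack(target, "lira")  # repeat for worst-case and attribute inference
model.request_release("modelsavefile.sav", target)  # re-runs attacks, saves artefacts
```

The TRE-side scripts in this patch then reverse the flow: they reload the released model and artefacts and re-run the same attacks independently before making a release decision.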
diff --git a/user_stories/user_story_1/user_story_1_researcher.py b/user_stories/user_story_1/user_story_1_researcher.py
index bf50c3b3..f1c3bec4 100644
--- a/user_stories/user_story_1/user_story_1_researcher.py
+++ b/user_stories/user_story_1/user_story_1_researcher.py
@@ -27,7 +27,7 @@ def main():  # pylint: disable=too-many-statements, disable=too-many-locals
     """Create and train a model to be released."""
-    directory = "training_artefacts/"
+    directory = "training_artefacts"
     print("Creating directory for training artefacts")
     if not os.path.exists(directory):
@@ -37,7 +37,8 @@ def main():  # pylint: disable=too-many-statements, disable=too-many-locals
     print("Acting as researcher...")
     print()
-    filename = "../user_stories_resources/dataset_26_nursery.csv"
+    print(os.getcwd())
+    filename = os.path.join("..", "user_stories_resources", "dataset_26_nursery.csv")
     print("Reading data from " + filename)
     data = pd.read_csv(filename)
diff --git a/user_stories/user_story_2/user_story_2_researcher.py b/user_stories/user_story_2/user_story_2_researcher.py
index ed415acc..00757b52 100644
--- a/user_stories/user_story_2/user_story_2_researcher.py
+++ b/user_stories/user_story_2/user_story_2_researcher.py
@@ -27,7 +27,7 @@ def main():  # pylint: disable=too-many-locals
     """Create and train a model to be released."""
-    directory = "training_artefacts/"
+    directory = "training_artefacts"
     print("Creating directory for training artefacts")
     if not os.path.exists(directory):
@@ -36,8 +36,7 @@ def main():  # pylint: disable=too-many-locals
     print()
     print("Acting as researcher...")
     print()
-
-    filename = "../user_stories_resources/dataset_26_nursery.csv"
+    filename = os.path.join("..", "user_stories_resources", "dataset_26_nursery.csv")
     print("Reading data from " + filename)
     data = pd.read_csv(filename)
@@ -108,9 +107,9 @@
     # and instead provides only the model and the list of indices
     # which have been used to split the dataset
-    print("Saving training/testing indices to ./" + directory)
-    np.savetxt(directory + "indices_train.txt", indices_train, fmt="%d")
-    np.savetxt(directory + "indices_test.txt", indices_test, fmt="%d")
+    print("Saving training/testing indices to " + directory)
+    np.savetxt(os.path.join(directory, "indices_train.txt"), indices_train, fmt="%d")
+    np.savetxt(os.path.join(directory, "indices_test.txt"), indices_test, fmt="%d")
     logging.info("Dataset: %s", target.name)
     logging.info("Features: %s", target.features)
diff --git a/user_stories/user_story_2/user_story_2_tre.py b/user_stories/user_story_2/user_story_2_tre.py
index 51396ee2..7fa68689 100644
--- a/user_stories/user_story_2/user_story_2_tre.py
+++ b/user_stories/user_story_2/user_story_2_tre.py
@@ -13,6 +13,7 @@
 import argparse
+import os
 import pickle

 import numpy as np
 import pandas as pd
@@ -40,18 +41,19 @@ def generate_report(
     print(
         "(when instructions on how to recreate the dataset have been provided by the researcher)"
     )
+    print(directory)
     print()
-    filename = directory + target_model
+    filename = os.path.join(directory, target_model)
     print("Reading target model from " + filename)
     with open(filename, "rb") as f:
         target_model = pickle.load(f)
-    print("Reading training/testing indices from ./" + directory)
-    indices_train = np.loadtxt(directory + train_indices)
-    indices_test = np.loadtxt(directory + test_indices)
+    print("Reading training/testing indices from " + directory)
+    indices_train = np.loadtxt(os.path.join(directory, train_indices))
+    indices_test = np.loadtxt(os.path.join(directory, test_indices))
-    filename = "../user_stories_resources/dataset_26_nursery.csv"
+    filename = os.path.join("..", "user_stories_resources", "dataset_26_nursery.csv")
     print("Reading data from " + filename)
     data = pd.read_csv(filename)
@@ -107,13 +109,12 @@ def generate_report(
     t = GenerateTextReport()
     t.process_attack_target_json(
-        directory + attack_results, target_filename=directory + target_filename
+        os.path.join(directory, attack_results), target_filename=os.path.join(directory, target_filename)
     )
-    t.export_to_file(output_filename=directory + outfile, move_files=True)
-
-    print("Results written to " + directory + outfile)
+    t.export_to_file(output_filename=os.path.join(directory, outfile), move_files=True)
+    print("Results written to " + os.path.join(directory, outfile))

 def main():
     """Main method to parse arguments and then invoke report generation."""
@@ -129,7 +130,7 @@
         action="store",
         dest="training_artefacts_directory",
         required=False,
-        default="training_artefacts/",
+        default="training_artefacts",
         help=(
             "Folder containing training artefacts produced by researcher. Default = %(default)s."
         ),
@@ -141,7 +142,7 @@
         action="store",
         dest="target_model",
         required=False,
-        default="/model.pkl",
+        default="model.pkl",
         help=("Filename of target model. Default = %(default)s."),
     )
diff --git a/user_stories/user_story_3/user_story_3_researcher.py b/user_stories/user_story_3/user_story_3_researcher.py
index 62493d69..ae3b1b58 100644
--- a/user_stories/user_story_3/user_story_3_researcher.py
+++ b/user_stories/user_story_3/user_story_3_researcher.py
@@ -31,7 +31,7 @@
 if not os.path.exists(directory):
     os.makedirs(directory)
-filename = "../user_stories_resources/dataset_26_nursery.csv"
+filename = os.path.join("..", "user_stories_resources", "dataset_26_nursery.csv")
 print("Reading data from " + filename)
 data = pd.read_csv(filename)
diff --git a/user_stories/user_story_3/user_story_3_tre.py b/user_stories/user_story_3/user_story_3_tre.py
index dcc4f70e..dc875a22 100644
--- a/user_stories/user_story_3/user_story_3_tre.py
+++ b/user_stories/user_story_3/user_story_3_tre.py
@@ -69,7 +69,7 @@ def generate_report(
     testy = np.loadtxt(os.path.join(directory, y_test))

     target = Target(model=target_model)
-    # Wrap the training and test data into the Data object
+    # Wrap the training and test data into the Target object
     target.add_processed_data(trainX, trainy, testX, testy)

     # Run the attack
diff --git a/user_stories/user_story_7/user_story_7_researcher.py b/user_stories/user_story_7/user_story_7_researcher.py
index 2e986105..30bbecd1 100644
--- a/user_stories/user_story_7/user_story_7_researcher.py
+++ b/user_stories/user_story_7/user_story_7_researcher.py
@@ -27,7 +27,7 @@ def main():  # pylint: disable=too-many-locals
     """Create and train model to be released."""
-    directory = "training_artefacts/"
+    directory = "training_artefacts"
     print("Creating directory for training artefacts")
     if not os.path.exists(directory):
@@ -37,7 +37,7 @@ def main():  # pylint: disable=too-many-locals
     print("Acting as researcher...")
     print()
-    filename = "../user_stories_resources/dataset_26_nursery.csv"
+    filename = os.path.join("..", "user_stories_resources", "dataset_26_nursery.csv")
     print("Reading data from " + filename)
     data = pd.read_csv(filename)
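As a footnote to the path changes above, here is a small self-contained sketch of the indices round trip that user_story_2_researcher.py and user_story_2_tre.py rely on, using the same os.path.join pattern this patch introduces. The toy index arrays are made up for illustration, and dtype=int is added on reload because np.loadtxt returns floats by default.

```python
# Toy round trip mirroring the user_story_2 scripts: the researcher saves the
# train/test row indices, the TRE reloads them to recreate the exact split.
import os

import numpy as np

directory = "training_artefacts"  # same folder name the scripts default to
os.makedirs(directory, exist_ok=True)

# Researcher side: persist the row indices used for the train/test split.
indices_train = np.array([0, 2, 5, 7])  # made-up indices for illustration
indices_test = np.array([1, 3, 4, 6])
np.savetxt(os.path.join(directory, "indices_train.txt"), indices_train, fmt="%d")
np.savetxt(os.path.join(directory, "indices_test.txt"), indices_test, fmt="%d")

# TRE side: reload the indices; cast to int because np.loadtxt defaults to float.
reloaded_train = np.loadtxt(os.path.join(directory, "indices_train.txt"), dtype=int)
reloaded_test = np.loadtxt(os.path.join(directory, "indices_test.txt"), dtype=int)

assert np.array_equal(indices_train, reloaded_train)
assert np.array_equal(indices_test, reloaded_test)
```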