From be2f32b138d28f0c8a500e52f393f0e54336dc9d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 21 Nov 2022 22:54:48 +0000 Subject: [PATCH] eval output + input changes back --- README.md | 6 ++++-- bin/cherri | 26 ++++++++++++++++++++------ bin/find_occupied_regions.py | 14 +++----------- bin/find_trusted_RRI.py | 4 ++-- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index d7d441c..68ffac4 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ export PYTHONHASHSEED=31337 After setting the environment variable, reactivate your environment: ``` conda deactivate -conda acivate cherri +conda activate cherri ``` #### Manual installation @@ -193,7 +193,9 @@ Input parameters for CheRRI's **eval** mode (`cherri eval`): #### Output in evaluation mode At the end of the run the location of the results table is given. -The final results table will have all columns of the input table and an additional prediction column, where you find the predicted class of each RRI (0 or 1). +The final results table will have the query and target IDs of your input sequences (`target_ID`,`query_ID`), the score of your instance (`instance_score`), the predicted class of each RRI (0 or 1) (`predicted_label`), if you are running the validation mode with `-hf on`, the positive or negative label is given (`true_lable`), and finally all features of the instance are provided. + +The IDs are a summary of `chromosome;strand;start;stop` of the first (target) and the second (query) sequence. 
Throughout the program, several output files are generated and stored in the following structure: diff --git a/bin/cherri b/bin/cherri index d7978e0..faae591 100644 --- a/bin/cherri +++ b/bin/cherri @@ -198,6 +198,9 @@ def setup_argument_parser(): p_mrg.add_argument("-fh", "--filter_hybrid", default="off", help= "Filter the data for hybrids already detected by ChiRA (set 'on' to filter, default:'off')") + p_mrg.add_argument("-on", "--out_name", + default="non", + help= "Name for the output directory, default 'date_Cherri_evaluating_RRIs' ") @@ -502,15 +505,16 @@ def main_train(args): | ├── test_train_context_50_pos_occ_neg.csv | ├── test_train_context_50_pos_occ_pos.csv | ├── feature_files - | ├── feature_filtered_test_eval_context_150_pos.csv - | ├── feature_filtered_test_eval_context_150_neg.csv - | ├── training_data_test_eval_context_150.npz + | ├── feature_filtered_test_train_context_150_pos.csv + | ├── feature_filtered_test_train_context_150_neg.csv + | ├── training_data_test_train_context_150.npz | ├── model | ├── features | ├── test_train_context_50.npz | ├── optimized | ├── test_train_context_50.model | ├── test_train_context_50.csv + | ├── full_test_train_context_50.model """ args = parser.parse_args() @@ -537,6 +541,8 @@ def main_train(args): n_jobs = args.n_jobs mixed = args.mixed filter_hybrid = args.filter_hybrid + out_name = args.out_name + methods = (f'extra_trees passive_aggressive random_forest sgd ' f'gradient_boosting mlp') @@ -552,7 +558,12 @@ def main_train(args): # define output folder timestr = time.strftime("%Y%m%d") - out_path = f'{out_path}/{timestr}_Cherri_build_model/' + + if out_name == 'non': + out_path = f'{out_path}/{timestr}_Cherri_build_model/' + else: + out_path = f'{out_path}/{out_name}/' + #if set_path == 'off' and mixed == 'off': # if not os.path.exists(out_path): # os.mkdir(out_path) @@ -650,7 +661,8 @@ def main_train(args): midel_name = f'{experiment_name}_context_{str(context)}' X_list = [] y_list = [] - for data in 
replicates: + # args.list_of_replicates + for data in args.list_of_replicates: feature_path = f'{input_path_RRIs}/{data}/feature_files/' feature_neg = (f'{feature_path}/feature_filtered_{data}_context_' f'{str(context)}_pos_occ_neg.csv') @@ -719,7 +731,9 @@ def main_train(args): opt_call = (f'python -W ignore -m biofilm.biofilm-optimize6 {loaddata} ' f'--memoryMBthread {memoryPerThread} --folds 0 ' f'--out {opt_path}{midel_name} --preprocess True ' - f'--n_jobs {n_jobs} --time {run_time} --methods {methods}') + f'--n_jobs {n_jobs} --time {run_time} --methods {methods}' + # f' --tmp_folder {opt_path}\autosklearn_temp' + ) print('4a. Optimize model\n') #print(opt_call) diff --git a/bin/find_occupied_regions.py b/bin/find_occupied_regions.py index 7c6fe18..6fd914a 100644 --- a/bin/find_occupied_regions.py +++ b/bin/find_occupied_regions.py @@ -196,16 +196,8 @@ def main(): #### Get RRI data by calling find trusted RRI with a very low overlap th of 5% ### only take uniquely mapped reads - ####### Get RRI data - list_rep = [] - for rep in replicates: - list_rep.append(input_path_RRIs + '/' + rep) - replicate_string = ' '.join(list_rep) - print(list_rep) - - - rri_call_param = ('-i ' + 'not_needed' + ' -r ' + replicate_string + + rri_call_param = ('-i ' + input_path_RRIs + ' -r ' + ' '.join(replicates) + ' -o ' + str(overlap_th) +' -n rri_occupied_regions -d ' + out_path + ' -s ' + str(score_th)) if filter_hybrid == 'on': @@ -215,9 +207,9 @@ def main(): rri_file = (out_path + 'rri_occupied_regions_overlap_' + str(overlap_th) + '.csv') - if len(list_rep) == 1: + if len(replicates) == 1: print('Info: only one experiment is used to build occupied regions') - in_file = list_rep[0] + in_file = replicates[0] print(in_file) # df_replicat = rl.read_chira_data(in_file) sep = rl.check_file_type(in_file) diff --git a/bin/find_trusted_RRI.py b/bin/find_trusted_RRI.py index 7a8350c..ea73a7c 100644 --- a/bin/find_trusted_RRI.py +++ b/bin/find_trusted_RRI.py @@ -152,8 +152,8 @@ def 
build_replicat_library_to_compare(input_path, list_of_replicates, score_th): inter_replicat_list = [] rep_size_list = [] for file in list_of_replicates: - # in_file = input_path + '/' + file - in_file = file + in_file = input_path + '/' + file + #in_file = file df_test = pd.read_table(in_file, sep=',') #print(df_test.info()) sep = rl.check_file_type(in_file)