Merge pull request #47 from BackofenLab/input

Draft changes for galaxy wrapper
BackofenLab · Nov 23, 2022 · ebf4495 · ebf4495
2 parents c1c91db + f9c45bc
commit ebf4495
Show file tree

Hide file tree

Showing 7 changed files with 117 additions and 73 deletions.
diff --git a/README.md b/README.md
@@ -76,7 +76,7 @@ export PYTHONHASHSEED=31337
 After setting the environment variable, reactivate your environment:
 ```
 conda deactivate
-conda acivate cherri
+conda activate cherri
 ```
 
 #### Manual installation
@@ -193,7 +193,9 @@ Input parameters for CheRRI's **eval** mode (`cherri eval`):
 #### Output in evaluation mode
 
 At the end of the run the location of the results table is given.
-The final results table will have all columns of the input table and an additional prediction column, where you find the predicted class of each RRI (0 or 1).
+The final results table will have your the query and target ID's or your input sequences (`target_ID`,`query_ID`), the score of your instance (`instance_score`), the predicted class of each RRI (0 or 1) (`predicted_label`), if you are running the validation mode with `-hf on` the positive or negative label is given (`true_lable`), and finally all features of the instance are provided.
+
+The Ids are a summary of `chromosme;strand;start;stop` oft the first (target) and the second (query) sequence.
 
 Throughout the program, several output files are generated and stored in the following structure:
 
@@ -212,13 +214,13 @@ Throughout the program, several output files are generated and stored in the fol
 
 
 #### Validate your model using the **eval** mode
-You can also use CheRRIs **eval** mode to create a validation result table and than use the [coupute_f1](./scripts/plots/compute_f1.py) to get the F1 score.
+You can also use CheRRIs **eval** mode to create a validation result table and than use the [compute_f1](./scripts/plots/compute_f1.py) to get the F1 score.
 
 In the following is a example call to validate a theoretical model build from DataA
 ```
 cherri eval -i1 /path/to/Model_folder/DataA/feature_files/feature_filtered_<DataA>_context_<150>_pos_occ -g human -l human -o /path/to/Model_folder -n <val_modelA> -c 150 -st on -m  /path/to/Model_folder/DataA/model/optimized/full_<DataA>_context_<150>.model -mp  /path/to/Model_folder/DataA/feature_files/training_data_<DataA>_context_<150>.npz -j 10 -on evaluation -hf on
 ```
-Than use the result file to compute the F1 score using [coupute_f1](./scripts/plots/compute_f1.py).
+Than use the result file to compute the F1 score using [compute_f1](./scripts/plots/compute_f1.py).
 
 ### Build a new CheRRI model in training mode
 

diff --git a/bin/cherri b/bin/cherri
@@ -8,11 +8,13 @@ import sys
 import rrieval.lib as rl
 from scipy.sparse import csr_matrix, vstack, hstack, load_npz, save_npz
 from ubergauss.tools import loadfile, dumpfile
+#import logging
+## DeBug, INFO
 # import subprocess
 
+#logging.basicConfing(filenema='cherri.log'level=logging.DEBUG)
 
-
-__version__ = "0.5"
+__version__ = "0.6"
 
 """
 
@@ -198,7 +200,12 @@ def setup_argument_parser():
     p_mrg.add_argument("-fh", "--filter_hybrid",
                         default="off",
                         help= "Filter the data for hybrids already detected by ChiRA (set 'on' to filter, default:'off')")
-
+    p_mrg.add_argument("-on", "--out_name",
+                        default="non",
+                        help= "Name for the output directory, default 'date_Cherri_evaluating_RRIs' ")
+    p_mrg.add_argument("-tp", "--temp_dir",
+                        default="off",
+                        help= "Set a temporary directory for autosklearn. Either provede a path or 'out' to set it to the output directory Defalut:off")
 
 
     return p
@@ -408,14 +415,14 @@ def main_eval(args):
             os.mkdir(feature_out_path)
 
 
-        file_pos = f'{pos_neg_out_path}{midel_name}pos.csv'
+        pos_data_file = f'{pos_neg_out_path}{midel_name}pos.csv'
         feature_pos = f'{feature_out_path}feature_filtered_{midel_name}_pos.csv'
 
-        if not os.path.isfile(file_pos):
+        if not os.path.isfile(pos_data_file):
             print(f'Error: no positive IntaRNA interaction found')
             sys.exit()
 
-        pos_feature_param = f'-i {file_pos} -f all -o {feature_pos}'
+        pos_feature_param = f'-i {pos_data_file} -f all -o {feature_pos}'
         call_pos_feature = f'get_features.py {pos_feature_param}'
         print('2. Compute features\n')
         #print(call_pos_feature)
@@ -431,9 +438,9 @@ def main_eval(args):
         if not os.path.exists(feature_out_path):
             os.mkdir(feature_out_path)
 
-        pos_data = f'{RRIs_table}_pos.csv'
-        neg_data = f'{RRIs_table}_neg.csv'
-        X, y = rl.read_pos_neg_data(pos_data, neg_data)
+        pos_data_file = f'{RRIs_table}_pos.csv'
+        neg_data_file = f'{RRIs_table}_neg.csv'
+        X, y = rl.read_pos_neg_data(pos_data_file, neg_data_file)
         # X['true_label'] = y
         #äprint(X.info())
 
@@ -456,14 +463,22 @@ def main_eval(args):
         os.mkdir(eval_path)
     print('2b. Classify RRI instances\n')
 
+    # get ID_df
+    #print(pos_data_file)
+    df_ID = rl.get_ids(pos_data_file)
+    #print('***df IDS')
+    #print(df_ID)
+
     #print(X_filterd.info())
     #print(X_filterd.columns.tolist())
     eval_file = f'{eval_path}evaluation_results_{experiment_name}.csv'
 
     if hand_feat == 'off':
-        df_eval = rl.classify(X_filterd, model_file, eval_file)
+        df_eval = rl.classify(X_filterd, model_file, eval_file, df_ID)
     elif hand_feat == 'on':
-        df_eval = rl.classify(X_filterd, model_file, eval_file, True, y)
+        df_eval = rl.classify(X_filterd, model_file, eval_file, df_ID, True, y)
+
+
 
     #print(df_eval)
     print('##########################################')
@@ -494,15 +509,16 @@ def main_train(args):
     |       ├── test_train_context_50_pos_occ_neg.csv
     |       ├── test_train_context_50_pos_occ_pos.csv
     |   ├── feature_files
-    |       ├── feature_filtered_test_eval_context_150_pos.csv
-    |       ├── feature_filtered_test_eval_context_150_neg.csv
-    |       ├── training_data_test_eval_context_150.npz
+    |       ├── feature_filtered_test_train_context_150_pos.csv
+    |       ├── feature_filtered_test_train_context_150_neg.csv
+    |       ├── training_data_test_train_context_150.npz
     |   ├── model
     |       ├── features
     |           ├── test_train_context_50.npz
     |       ├── optimized
     |           ├── test_train_context_50.model
     |           ├── test_train_context_50.csv
+    |           ├── full_test_train_context_50.model
     """
 
     args = parser.parse_args()
@@ -529,6 +545,9 @@ def main_train(args):
     n_jobs = args.n_jobs
     mixed = args.mixed
     filter_hybrid = args.filter_hybrid
+    out_name = args.out_name
+    temp_dir = args.temp_dir
+
 
     methods = (f'extra_trees passive_aggressive random_forest sgd '
                f'gradient_boosting mlp')
@@ -544,7 +563,12 @@ def main_train(args):
 
     # define output folder
     timestr = time.strftime("%Y%m%d")
-    out_path =  f'{out_path}/{timestr}_Cherri_build_model/'
+
+    if out_name == 'non':
+        out_path =  f'{out_path}/{timestr}_Cherri_build_model/'
+    else:
+        out_path =  f'{out_path}/{out_name}/'
+
     #if set_path == 'off' and mixed == 'off':
     #    if not os.path.exists(out_path):
     #        os.mkdir(out_path)
@@ -642,7 +666,8 @@ def main_train(args):
         midel_name =  f'{experiment_name}_context_{str(context)}'
         X_list = []
         y_list = []
-        for data in replicates:
+        # args.list_of_replicates
+        for data in args.list_of_replicates:
             feature_path = f'{input_path_RRIs}/{data}/feature_files/'
             feature_neg = (f'{feature_path}/feature_filtered_{data}_context_'
                            f'{str(context)}_pos_occ_neg.csv')
@@ -708,10 +733,18 @@ def main_train(args):
 
     if use_structure == 'ON' or use_structure == 'on':
         loaddata = f' --infile {feat_output} '
-        opt_call = (f'python -W ignore -m biofilm.biofilm-optimize6 {loaddata} '
+        opt_call = (f'python -W ignore -m biofilm.optimize6 {loaddata} '
                     f'--memoryMBthread {memoryPerThread} --folds 0 '
                     f'--out {opt_path}{midel_name} --preprocess True '
-                    f'--n_jobs {n_jobs} --time {run_time} --methods {methods}')
+                    f'--n_jobs {n_jobs} --time {run_time} --methods {methods}'
+                    # f' --tmp_folder {opt_path}\autosklearn_temp'
+                    )
+        if temp_dir != 'off':
+            if temp_dir == 'out' or temp_dir == 'OUT':
+                opt_call = opt_call + f' --tmp_folder {opt_path}/autosklearn_temp'
+            else:
+                opt_call = opt_call + f' --tmp_folder {temp_dir}/autosklearn_temp'
+
 
         print('4a. Optimize model\n')
         #print(opt_call)
@@ -728,10 +761,17 @@ def main_train(args):
         rl.call_script(call_refit)
     elif use_structure == 'OFF' or use_structure == 'off':
         loaddata = f' --infile {feat_output}  --featurefile {out_feat}'
-        opt_call = (f'python -W ignore -m biofilm.biofilm-optimize6 {loaddata} '
+        opt_call = (f'python -W ignore -m biofilm.optimize6 {loaddata} '
                     f'--memoryMBthread {memoryPerThread} --folds 0 '
                     f'--out {opt_path}{midel_name} --preprocess True '
                     f'--n_jobs {n_jobs} --time {run_time} --methods {methods}')
+        if temp_dir != 'off':
+            if temp_dir == 'out' or temp_dir == 'OUT':
+                opt_call= f'{opt_call} --tmp_folder {opt_path}\autosklearn_temp'
+            else:
+                opt_call= f'{opt_call} --tmp_folder {temp_dir}\autosklearn_temp'
+
+
         print('4b. Optimize model\n')
         #print(opt_call)
         out = rl.call_script(opt_call, reprot_stdout=True,asset_err=False)

diff --git a/bin/find_occupied_regions.py b/bin/find_occupied_regions.py
@@ -196,8 +196,8 @@ def main():
     #### Get RRI data by calling find trusted RRI with a very low overlap th of 5%
     ### only take uniquely mapped reads
 
-    ####### Get RRI data
-    rri_call_param = ('-i ' + input_path_RRIs + ' -r ' + ' '.join(replicates) +
+
+    rri_call_param = ('-i ' +  input_path_RRIs + ' -r ' + ' '.join(replicates) +
                      ' -o ' + str(overlap_th) +' -n rri_occupied_regions -d ' +
                      out_path + ' -s ' +  str(score_th))
     if filter_hybrid == 'on':
@@ -209,7 +209,7 @@ def main():
 
     if len(replicates) == 1:
         print('Info: only one experiment is used to build occupied regions')
-        in_file = input_path_RRIs + replicates[0]
+        in_file = replicates[0]
         print(in_file)
         # df_replicat = rl.read_chira_data(in_file)
         sep = rl.check_file_type(in_file)

diff --git a/bin/find_trusted_RRI.py b/bin/find_trusted_RRI.py
@@ -2,8 +2,6 @@
 import pandas as pd
 import math
 import numpy as np
-#import matplotlib as mpl
-#import matplotlib.pyplot as plt
 from collections import defaultdict
 from interlap import InterLap
 import sys
@@ -155,6 +153,7 @@ def build_replicat_library_to_compare(input_path, list_of_replicates, score_th):
     rep_size_list = []
     for file in list_of_replicates:
         in_file = input_path + '/' + file
+        #in_file = file
         df_test = pd.read_table(in_file, sep=',')
         #print(df_test.info())
         sep = rl.check_file_type(in_file)
@@ -901,28 +900,6 @@ def main():
     df_final_output.to_csv(output_path + output_name, index=False)
 
 
-    #### Plotting ######
-
-    #histogrom energy
-    #fig1 = plt.figure()
-    #bins = np.arange(min(energy_list), max(energy_list), 5)
-
-    #plt.hist(energy_list, bins=bins)
-    #fig1.savefig(plot_path + "histogram_energy.pdf", bbox_inches='tight')
-
-    #seq1_len_list = [len[0] for len in interaction_length_also_nan]
-    #seq2_len_list = [len[1] for len in interaction_length_also_nan]
-
-    #d = {'rri_seq1': seq1_len_list, 'rri_seq2': seq2_len_list}
-    #df_rri_len = pd.DataFrame(data=d)
-
-    #myFig = plt.figure()
-    #boxplot = df_rri_len.boxplot(column=['rri_seq1', 'rri_seq2'])
-
-    #myFig.savefig(plot_path + "boxplot_rri_len_seq.pdf", bbox_inches='tight')
-
-    # input_path = '/home/teresa/Dokumente/RNA_RNA_interaction_evaluation/RNA_RNA_binding_evaluation/data/training/Paris/'
-    # list_of_replicates = ['test_rep1.tabular', 'test_rep2.tabular', 'test_rep3.tabular']
 
 if __name__ == '__main__':
     main()
diff --git a/bin/generate_pos_neg_with_context.py b/bin/generate_pos_neg_with_context.py
@@ -317,8 +317,11 @@ def join_result_and_infos(df, lost_inst, row, list_rows_add):
     else:
         val = {}
         # information of RRI is prepared to attach to predictions by IntaRNA
+        #print('*** row: ***')
+        #print(row)
         for col_n in list_rows_add:
             #print(col_n)
+            #print()
             val[col_n] = [row[col_n]]*len(df)
 
         df_append = pd.DataFrame.from_dict(val)
@@ -541,7 +544,12 @@ def get_context_added(input_rris, output_path, genome_file, context, context_not
     df_contex['target_key'] =  df_contex['chrom_1st'].astype(str)+ ';' + df_contex['strand_1st'].astype(str)
     df_contex['query_key'] =  df_contex['chrom_2end'].astype(str)+ ';' + df_contex['strand_2end'].astype(str)
 
+    df_contex['target_ID'] =  df_contex['target_key'].astype(str)+ ';' + df_contex['start_1st'].astype(str) + ';' + df_contex['end_1st'].astype(str)
+    df_contex['query_ID'] =  df_contex['query_key'].astype(str)+ ';' + df_contex['start_2end'].astype(str)+ ';' + df_contex['end_2end'].astype(str)
+
+
     df_contex.to_csv(context_file, index=False)
+    # print(df_contex.info())
     return df_contex
 
 
@@ -746,7 +754,8 @@ def get_header():
                      'biotype_region_1st', 'biotype_region_2end',
                      'ID_1st','ID_2end','con_target','con_query',
                      'target_con_s','target_con_e','query_con_s',
-                     'query_con_e', 'target_key', 'query_key']
+                     'query_con_e', 'target_key', 'query_key',
+                     'target_ID', 'query_ID']
     intaRNA_col_name = 'id1,start1,end1,id2,start2,end2,subseqDP,hybridDP,E,seedStart1,seedEnd1,seedStart2,seedEnd2,seedE,E_hybrid,ED1,ED2'
     list_intaRNA_col_name = intaRNA_col_name.split(',')
     header = list_intaRNA_col_name + list_rows_add
@@ -787,7 +796,7 @@ def main():
                         help= "IntaRNA parameter file",
                         default="./IntaRNA_param.txt")
     parser.add_argument("-m", "--mode",
-                        help= "which Cherri mode is running",
+                        help= "which CheRRI mode is running",
                         default="eval")
 
 
@@ -851,18 +860,6 @@ def main():
     occupied_InteLab = rl.load_occupied_data(input_occupied)
 
 
-
-        #chrom_dict = rl.read_table_into_dic(chrom_len_file)
-        #occupied_InteLab = defaultdict(InterLap)
-
-        #for i in chrom_dict:
-            #chrom = rl.check_convert_chr_id(i)
-            #if chrom != False:
-                #occupied_InteLab[str(chrom) + ';+'].add((0, 0, ['empty']))
-                #occupied_InteLab[str(chrom) + ';-'].add((0, 0, ['empty']))
-        #print(occupied_InteLab)
-
-
     # Reporting how many instances did not lead to a result
     context_not_full = 0
     lost_inst_pos = 0