eval output + input changes back

BackofenLab · Nov 21, 2022 · be2f32b · be2f32b
1 parent 12a3954
commit be2f32b
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -76,7 +76,7 @@ export PYTHONHASHSEED=31337
 After setting the environment variable, reactivate your environment:
 ```
 conda deactivate
-conda acivate cherri
+conda activate cherri
 ```
 
 #### Manual installation
@@ -193,7 +193,9 @@ Input parameters for CheRRI's **eval** mode (`cherri eval`):
 #### Output in evaluation mode
 
 At the end of the run the location of the results table is given.
-The final results table will have all columns of the input table and an additional prediction column, where you find the predicted class of each RRI (0 or 1).
+The final results table contains the target and query IDs of your input sequences (`target_ID`, `query_ID`), the score of each instance (`instance_score`), and the predicted class of each RRI (0 or 1) (`predicted_label`). If you are running the validation mode with `-hf on`, the positive or negative label is also given (`true_lable`). Finally, all features of the instance are provided.
+
+The IDs are a summary of `chromosome;strand;start;stop` of the first (target) and the second (query) sequence.
 
 Throughout the program, several output files are generated and stored in the following structure:
 

diff --git a/bin/cherri b/bin/cherri
@@ -198,6 +198,9 @@ def setup_argument_parser():
     p_mrg.add_argument("-fh", "--filter_hybrid",
                         default="off",
                         help= "Filter the data for hybrids already detected by ChiRA (set 'on' to filter, default:'off')")
+    p_mrg.add_argument("-on", "--out_name",
+                        default="non",
+                        help= "Name for the output directory, default 'date_Cherri_evaluating_RRIs' ")
 
 
 
@@ -502,15 +505,16 @@ def main_train(args):
     |       ├── test_train_context_50_pos_occ_neg.csv
     |       ├── test_train_context_50_pos_occ_pos.csv
     |   ├── feature_files
-    |       ├── feature_filtered_test_eval_context_150_pos.csv
-    |       ├── feature_filtered_test_eval_context_150_neg.csv
-    |       ├── training_data_test_eval_context_150.npz
+    |       ├── feature_filtered_test_train_context_150_pos.csv
+    |       ├── feature_filtered_test_train_context_150_neg.csv
+    |       ├── training_data_test_train_context_150.npz
     |   ├── model
     |       ├── features
     |           ├── test_train_context_50.npz
     |       ├── optimized
     |           ├── test_train_context_50.model
     |           ├── test_train_context_50.csv
+    |           ├── full_test_train_context_50.model
     """
 
     args = parser.parse_args()
@@ -537,6 +541,8 @@ def main_train(args):
     n_jobs = args.n_jobs
     mixed = args.mixed
     filter_hybrid = args.filter_hybrid
+    out_name = args.out_name
+
 
     methods = (f'extra_trees passive_aggressive random_forest sgd '
                f'gradient_boosting mlp')
@@ -552,7 +558,12 @@ def main_train(args):
 
     # define output folder
     timestr = time.strftime("%Y%m%d")
-    out_path =  f'{out_path}/{timestr}_Cherri_build_model/'
+
+    if out_name == 'non':
+        out_path =  f'{out_path}/{timestr}_Cherri_build_model/'
+    else:
+        out_path =  f'{out_path}/{out_name}/'
+
     #if set_path == 'off' and mixed == 'off':
     #    if not os.path.exists(out_path):
     #        os.mkdir(out_path)
@@ -650,7 +661,8 @@ def main_train(args):
         midel_name =  f'{experiment_name}_context_{str(context)}'
         X_list = []
         y_list = []
-        for data in replicates:
+        # args.list_of_replicates
+        for data in args.list_of_replicates:
             feature_path = f'{input_path_RRIs}/{data}/feature_files/'
             feature_neg = (f'{feature_path}/feature_filtered_{data}_context_'
                            f'{str(context)}_pos_occ_neg.csv')
@@ -719,7 +731,9 @@ def main_train(args):
         opt_call = (f'python -W ignore -m biofilm.biofilm-optimize6 {loaddata} '
                     f'--memoryMBthread {memoryPerThread} --folds 0 '
                     f'--out {opt_path}{midel_name} --preprocess True '
-                    f'--n_jobs {n_jobs} --time {run_time} --methods {methods}')
+                    f'--n_jobs {n_jobs} --time {run_time} --methods {methods}'
+                    # f' --tmp_folder {opt_path}\autosklearn_temp'
+                    )
 
         print('4a. Optimize model\n')
         #print(opt_call)

diff --git a/bin/find_occupied_regions.py b/bin/find_occupied_regions.py
@@ -196,16 +196,8 @@ def main():
     #### Get RRI data by calling find trusted RRI with a very low overlap th of 5%
     ### only take uniquely mapped reads
 
-    ####### Get RRI data
-    list_rep = []
-    for rep in replicates:
-        list_rep.append(input_path_RRIs + '/' + rep)
-    replicate_string = ' '.join(list_rep)
 
-    print(list_rep)
-
-
-    rri_call_param = ('-i ' + 'not_needed' + ' -r ' + replicate_string +
+    rri_call_param = ('-i ' +  input_path_RRIs + ' -r ' + ' '.join(replicates) +
                      ' -o ' + str(overlap_th) +' -n rri_occupied_regions -d ' +
                      out_path + ' -s ' +  str(score_th))
     if filter_hybrid == 'on':
@@ -215,9 +207,9 @@ def main():
     rri_file = (out_path + 'rri_occupied_regions_overlap_' +
                 str(overlap_th) + '.csv')
 
-    if len(list_rep) == 1:
+    if len(replicates) == 1:
         print('Info: only one experiment is used to build occupied regions')
-        in_file = list_rep[0]
+        in_file = replicates[0]
         print(in_file)
         # df_replicat = rl.read_chira_data(in_file)
         sep = rl.check_file_type(in_file)

diff --git a/bin/find_trusted_RRI.py b/bin/find_trusted_RRI.py
@@ -152,8 +152,8 @@ def build_replicat_library_to_compare(input_path, list_of_replicates, score_th):
     inter_replicat_list = []
     rep_size_list = []
     for file in list_of_replicates:
-        # in_file = input_path + '/' + file
-        in_file = file
+        in_file = input_path + '/' + file
+        #in_file = file
         df_test = pd.read_table(in_file, sep=',')
         #print(df_test.info())
         sep = rl.check_file_type(in_file)