Skip to content

Commit

Permalink
Merge pull request #47 from BackofenLab/input
Browse files Browse the repository at this point in the history
Draft changes for galaxy wrapper
  • Loading branch information
teresa-m authored Nov 23, 2022
2 parents c1c91db + f9c45bc commit ebf4495
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 73 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ export PYTHONHASHSEED=31337
After setting the environment variable, reactivate your environment:
```
conda deactivate
conda acivate cherri
conda activate cherri
```

#### Manual installation
Expand Down Expand Up @@ -193,7 +193,9 @@ Input parameters for CheRRI's **eval** mode (`cherri eval`):
#### Output in evaluation mode

At the end of the run the location of the results table is given.
The final results table will have all columns of the input table and an additional prediction column, where you find the predicted class of each RRI (0 or 1).
The final results table will have your the query and target ID's or your input sequences (`target_ID`,`query_ID`), the score of your instance (`instance_score`), the predicted class of each RRI (0 or 1) (`predicted_label`), if you are running the validation mode with `-hf on` the positive or negative label is given (`true_lable`), and finally all features of the instance are provided.

The Ids are a summary of `chromosme;strand;start;stop` oft the first (target) and the second (query) sequence.

Throughout the program, several output files are generated and stored in the following structure:

Expand All @@ -212,13 +214,13 @@ Throughout the program, several output files are generated and stored in the fol


#### Validate your model using the **eval** mode
You can also use CheRRIs **eval** mode to create a validation result table and than use the [coupute_f1](./scripts/plots/compute_f1.py) to get the F1 score.
You can also use CheRRIs **eval** mode to create a validation result table and than use the [compute_f1](./scripts/plots/compute_f1.py) to get the F1 score.

In the following is a example call to validate a theoretical model build from DataA
```
cherri eval -i1 /path/to/Model_folder/DataA/feature_files/feature_filtered_<DataA>_context_<150>_pos_occ -g human -l human -o /path/to/Model_folder -n <val_modelA> -c 150 -st on -m /path/to/Model_folder/DataA/model/optimized/full_<DataA>_context_<150>.model -mp /path/to/Model_folder/DataA/feature_files/training_data_<DataA>_context_<150>.npz -j 10 -on evaluation -hf on
```
Than use the result file to compute the F1 score using [coupute_f1](./scripts/plots/compute_f1.py).
Than use the result file to compute the F1 score using [compute_f1](./scripts/plots/compute_f1.py).

### Build a new CheRRI model in training mode

Expand Down
78 changes: 59 additions & 19 deletions bin/cherri
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ import sys
import rrieval.lib as rl
from scipy.sparse import csr_matrix, vstack, hstack, load_npz, save_npz
from ubergauss.tools import loadfile, dumpfile
#import logging
## DeBug, INFO
# import subprocess

#logging.basicConfing(filenema='cherri.log'level=logging.DEBUG)


__version__ = "0.5"
__version__ = "0.6"

"""
Expand Down Expand Up @@ -198,7 +200,12 @@ def setup_argument_parser():
p_mrg.add_argument("-fh", "--filter_hybrid",
default="off",
help= "Filter the data for hybrids already detected by ChiRA (set 'on' to filter, default:'off')")

p_mrg.add_argument("-on", "--out_name",
default="non",
help= "Name for the output directory, default 'date_Cherri_evaluating_RRIs' ")
p_mrg.add_argument("-tp", "--temp_dir",
default="off",
help= "Set a temporary directory for autosklearn. Either provede a path or 'out' to set it to the output directory Defalut:off")


return p
Expand Down Expand Up @@ -408,14 +415,14 @@ def main_eval(args):
os.mkdir(feature_out_path)


file_pos = f'{pos_neg_out_path}{midel_name}pos.csv'
pos_data_file = f'{pos_neg_out_path}{midel_name}pos.csv'
feature_pos = f'{feature_out_path}feature_filtered_{midel_name}_pos.csv'

if not os.path.isfile(file_pos):
if not os.path.isfile(pos_data_file):
print(f'Error: no positive IntaRNA interaction found')
sys.exit()

pos_feature_param = f'-i {file_pos} -f all -o {feature_pos}'
pos_feature_param = f'-i {pos_data_file} -f all -o {feature_pos}'
call_pos_feature = f'get_features.py {pos_feature_param}'
print('2. Compute features\n')
#print(call_pos_feature)
Expand All @@ -431,9 +438,9 @@ def main_eval(args):
if not os.path.exists(feature_out_path):
os.mkdir(feature_out_path)

pos_data = f'{RRIs_table}_pos.csv'
neg_data = f'{RRIs_table}_neg.csv'
X, y = rl.read_pos_neg_data(pos_data, neg_data)
pos_data_file = f'{RRIs_table}_pos.csv'
neg_data_file = f'{RRIs_table}_neg.csv'
X, y = rl.read_pos_neg_data(pos_data_file, neg_data_file)
# X['true_label'] = y
#äprint(X.info())

Expand All @@ -456,14 +463,22 @@ def main_eval(args):
os.mkdir(eval_path)
print('2b. Classify RRI instances\n')

# get ID_df
#print(pos_data_file)
df_ID = rl.get_ids(pos_data_file)
#print('***df IDS')
#print(df_ID)

#print(X_filterd.info())
#print(X_filterd.columns.tolist())
eval_file = f'{eval_path}evaluation_results_{experiment_name}.csv'

if hand_feat == 'off':
df_eval = rl.classify(X_filterd, model_file, eval_file)
df_eval = rl.classify(X_filterd, model_file, eval_file, df_ID)
elif hand_feat == 'on':
df_eval = rl.classify(X_filterd, model_file, eval_file, True, y)
df_eval = rl.classify(X_filterd, model_file, eval_file, df_ID, True, y)



#print(df_eval)
print('##########################################')
Expand Down Expand Up @@ -494,15 +509,16 @@ def main_train(args):
| ├── test_train_context_50_pos_occ_neg.csv
| ├── test_train_context_50_pos_occ_pos.csv
| ├── feature_files
| ├── feature_filtered_test_eval_context_150_pos.csv
| ├── feature_filtered_test_eval_context_150_neg.csv
| ├── training_data_test_eval_context_150.npz
| ├── feature_filtered_test_train_context_150_pos.csv
| ├── feature_filtered_test_train_context_150_neg.csv
| ├── training_data_test_train_context_150.npz
| ├── model
| ├── features
| ├── test_train_context_50.npz
| ├── optimized
| ├── test_train_context_50.model
| ├── test_train_context_50.csv
| ├── full_test_train_context_50.model
"""

args = parser.parse_args()
Expand All @@ -529,6 +545,9 @@ def main_train(args):
n_jobs = args.n_jobs
mixed = args.mixed
filter_hybrid = args.filter_hybrid
out_name = args.out_name
temp_dir = args.temp_dir


methods = (f'extra_trees passive_aggressive random_forest sgd '
f'gradient_boosting mlp')
Expand All @@ -544,7 +563,12 @@ def main_train(args):

# define output folder
timestr = time.strftime("%Y%m%d")
out_path = f'{out_path}/{timestr}_Cherri_build_model/'

if out_name == 'non':
out_path = f'{out_path}/{timestr}_Cherri_build_model/'
else:
out_path = f'{out_path}/{out_name}/'

#if set_path == 'off' and mixed == 'off':
# if not os.path.exists(out_path):
# os.mkdir(out_path)
Expand Down Expand Up @@ -642,7 +666,8 @@ def main_train(args):
midel_name = f'{experiment_name}_context_{str(context)}'
X_list = []
y_list = []
for data in replicates:
# args.list_of_replicates
for data in args.list_of_replicates:
feature_path = f'{input_path_RRIs}/{data}/feature_files/'
feature_neg = (f'{feature_path}/feature_filtered_{data}_context_'
f'{str(context)}_pos_occ_neg.csv')
Expand Down Expand Up @@ -708,10 +733,18 @@ def main_train(args):

if use_structure == 'ON' or use_structure == 'on':
loaddata = f' --infile {feat_output} '
opt_call = (f'python -W ignore -m biofilm.biofilm-optimize6 {loaddata} '
opt_call = (f'python -W ignore -m biofilm.optimize6 {loaddata} '
f'--memoryMBthread {memoryPerThread} --folds 0 '
f'--out {opt_path}{midel_name} --preprocess True '
f'--n_jobs {n_jobs} --time {run_time} --methods {methods}')
f'--n_jobs {n_jobs} --time {run_time} --methods {methods}'
# f' --tmp_folder {opt_path}\autosklearn_temp'
)
if temp_dir != 'off':
if temp_dir == 'out' or temp_dir == 'OUT':
opt_call = opt_call + f' --tmp_folder {opt_path}/autosklearn_temp'
else:
opt_call = opt_call + f' --tmp_folder {temp_dir}/autosklearn_temp'


print('4a. Optimize model\n')
#print(opt_call)
Expand All @@ -728,10 +761,17 @@ def main_train(args):
rl.call_script(call_refit)
elif use_structure == 'OFF' or use_structure == 'off':
loaddata = f' --infile {feat_output} --featurefile {out_feat}'
opt_call = (f'python -W ignore -m biofilm.biofilm-optimize6 {loaddata} '
opt_call = (f'python -W ignore -m biofilm.optimize6 {loaddata} '
f'--memoryMBthread {memoryPerThread} --folds 0 '
f'--out {opt_path}{midel_name} --preprocess True '
f'--n_jobs {n_jobs} --time {run_time} --methods {methods}')
if temp_dir != 'off':
if temp_dir == 'out' or temp_dir == 'OUT':
opt_call= f'{opt_call} --tmp_folder {opt_path}\autosklearn_temp'
else:
opt_call= f'{opt_call} --tmp_folder {temp_dir}\autosklearn_temp'


print('4b. Optimize model\n')
#print(opt_call)
out = rl.call_script(opt_call, reprot_stdout=True,asset_err=False)
Expand Down
6 changes: 3 additions & 3 deletions bin/find_occupied_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,8 @@ def main():
#### Get RRI data by calling find trusted RRI with a very low overlap th of 5%
### only take uniquely mapped reads

####### Get RRI data
rri_call_param = ('-i ' + input_path_RRIs + ' -r ' + ' '.join(replicates) +

rri_call_param = ('-i ' + input_path_RRIs + ' -r ' + ' '.join(replicates) +
' -o ' + str(overlap_th) +' -n rri_occupied_regions -d ' +
out_path + ' -s ' + str(score_th))
if filter_hybrid == 'on':
Expand All @@ -209,7 +209,7 @@ def main():

if len(replicates) == 1:
print('Info: only one experiment is used to build occupied regions')
in_file = input_path_RRIs + replicates[0]
in_file = replicates[0]
print(in_file)
# df_replicat = rl.read_chira_data(in_file)
sep = rl.check_file_type(in_file)
Expand Down
25 changes: 1 addition & 24 deletions bin/find_trusted_RRI.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
import pandas as pd
import math
import numpy as np
#import matplotlib as mpl
#import matplotlib.pyplot as plt
from collections import defaultdict
from interlap import InterLap
import sys
Expand Down Expand Up @@ -155,6 +153,7 @@ def build_replicat_library_to_compare(input_path, list_of_replicates, score_th):
rep_size_list = []
for file in list_of_replicates:
in_file = input_path + '/' + file
#in_file = file
df_test = pd.read_table(in_file, sep=',')
#print(df_test.info())
sep = rl.check_file_type(in_file)
Expand Down Expand Up @@ -901,28 +900,6 @@ def main():
df_final_output.to_csv(output_path + output_name, index=False)


#### Plotting ######

#histogrom energy
#fig1 = plt.figure()
#bins = np.arange(min(energy_list), max(energy_list), 5)

#plt.hist(energy_list, bins=bins)
#fig1.savefig(plot_path + "histogram_energy.pdf", bbox_inches='tight')

#seq1_len_list = [len[0] for len in interaction_length_also_nan]
#seq2_len_list = [len[1] for len in interaction_length_also_nan]

#d = {'rri_seq1': seq1_len_list, 'rri_seq2': seq2_len_list}
#df_rri_len = pd.DataFrame(data=d)

#myFig = plt.figure()
#boxplot = df_rri_len.boxplot(column=['rri_seq1', 'rri_seq2'])

#myFig.savefig(plot_path + "boxplot_rri_len_seq.pdf", bbox_inches='tight')

# input_path = '/home/teresa/Dokumente/RNA_RNA_interaction_evaluation/RNA_RNA_binding_evaluation/data/training/Paris/'
# list_of_replicates = ['test_rep1.tabular', 'test_rep2.tabular', 'test_rep3.tabular']

if __name__ == '__main__':
main()
25 changes: 11 additions & 14 deletions bin/generate_pos_neg_with_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,11 @@ def join_result_and_infos(df, lost_inst, row, list_rows_add):
else:
val = {}
# information of RRI is prepared to attach to predictions by IntaRNA
#print('*** row: ***')
#print(row)
for col_n in list_rows_add:
#print(col_n)
#print()
val[col_n] = [row[col_n]]*len(df)

df_append = pd.DataFrame.from_dict(val)
Expand Down Expand Up @@ -541,7 +544,12 @@ def get_context_added(input_rris, output_path, genome_file, context, context_not
df_contex['target_key'] = df_contex['chrom_1st'].astype(str)+ ';' + df_contex['strand_1st'].astype(str)
df_contex['query_key'] = df_contex['chrom_2end'].astype(str)+ ';' + df_contex['strand_2end'].astype(str)

df_contex['target_ID'] = df_contex['target_key'].astype(str)+ ';' + df_contex['start_1st'].astype(str) + ';' + df_contex['end_1st'].astype(str)
df_contex['query_ID'] = df_contex['query_key'].astype(str)+ ';' + df_contex['start_2end'].astype(str)+ ';' + df_contex['end_2end'].astype(str)


df_contex.to_csv(context_file, index=False)
# print(df_contex.info())
return df_contex


Expand Down Expand Up @@ -746,7 +754,8 @@ def get_header():
'biotype_region_1st', 'biotype_region_2end',
'ID_1st','ID_2end','con_target','con_query',
'target_con_s','target_con_e','query_con_s',
'query_con_e', 'target_key', 'query_key']
'query_con_e', 'target_key', 'query_key',
'target_ID', 'query_ID']
intaRNA_col_name = 'id1,start1,end1,id2,start2,end2,subseqDP,hybridDP,E,seedStart1,seedEnd1,seedStart2,seedEnd2,seedE,E_hybrid,ED1,ED2'
list_intaRNA_col_name = intaRNA_col_name.split(',')
header = list_intaRNA_col_name + list_rows_add
Expand Down Expand Up @@ -787,7 +796,7 @@ def main():
help= "IntaRNA parameter file",
default="./IntaRNA_param.txt")
parser.add_argument("-m", "--mode",
help= "which Cherri mode is running",
help= "which CheRRI mode is running",
default="eval")


Expand Down Expand Up @@ -851,18 +860,6 @@ def main():
occupied_InteLab = rl.load_occupied_data(input_occupied)



#chrom_dict = rl.read_table_into_dic(chrom_len_file)
#occupied_InteLab = defaultdict(InterLap)

#for i in chrom_dict:
#chrom = rl.check_convert_chr_id(i)
#if chrom != False:
#occupied_InteLab[str(chrom) + ';+'].add((0, 0, ['empty']))
#occupied_InteLab[str(chrom) + ';-'].add((0, 0, ['empty']))
#print(occupied_InteLab)


# Reporting how many instances did not lead to a result
context_not_full = 0
lost_inst_pos = 0
Expand Down
Loading

0 comments on commit ebf4495

Please sign in to comment.