diff --git a/group_to_GO_abundances.sh b/group_to_GO_abundances.sh index 7cb1c69..cf18a12 100755 --- a/group_to_GO_abundances.sh +++ b/group_to_GO_abundances.sh @@ -14,7 +14,9 @@ function shortUsage() { echo "Required options:" prettyOpt "-i" "Path to file with UniRef50 gene family abundance (HUMAnN2 output)" - prettyOpt "-o" "Path to output file which will contain GO slim term abudances" + prettyOpt "-m" "Path to file which will contain GO slim term abudances corresponding to molecular functions" + prettyOpt "-b" "Path to file which will contain GO slim term abudances corresponding to biological processes" + prettyOpt "-c" "Path to file which will contain GO slim term abudances corresponding to cellular components" echo echo "Other options:" prettyOpt "-a" "Path to basic Gene Ontology file" @@ -32,7 +34,9 @@ function shortUsage() { MY_PATH=`dirname "$0"` input_file="" -output_file="" +molecular_function_file="" +biological_processes_file="" +cellular_component_file="" go_file=$MY_PATH"/data/go.obo" slim_go_file=$MY_PATH"/data/goslim_metagenomics.obo" humann2_uniref_go=$MY_PATH"/data/map_infogo1000_uniref50.txt" @@ -40,13 +44,19 @@ goatools_path="goatools/scripts/" humann2_path="/usr/bin/" # Manage arguments -while getopts ":i:o:a:s:u:g:p:h" opt; do +while getopts ":i:m:b:c:a:s:u:g:p:h" opt; do case $opt in i) input_file=$OPTARG >&2 ;; - o) - output_file=$OPTARG >&2 + m) + molecular_function_file=$OPTARG >&2 + ;; + b) + biological_processes_file=$OPTARG >&2 + ;; + c) + cellular_component_file=$OPTARG >&2 ;; a) go_file=$OPTARG >&2 @@ -84,8 +94,18 @@ if [ -z $input_file ]; then shortUsage; fi -if [ -z $output_file ]; then - msg_error "Missing argument: -o" +if [ -z $molecular_function_file ]; then + msg_error "Missing argument: -m" + shortUsage; +fi + +if [ -z $biological_processes_file ]; then + msg_error "Missing argument: -b" + shortUsage; +fi + +if [ -z $cellular_component_file ]; then + msg_error "Missing argument: -c" shortUsage; fi @@ -144,7 +164,9 @@ echo "========================" python $MY_PATH"/src/format_humann2_output.py" \ --go_slim $slim_go_file \ --humann2_output $tmp_data_dir"/humann2_slim_go_abundances.txt" \ - --formatted_humann2_output $output_file + --molecular_function_output_file $molecular_function_file \ + --biological_processes_output_file $biological_processes_file \ + --cellular_component_output_file $cellular_component_file echo "" #rm -rf $tmp_data_dir diff --git a/src/format_humann2_output.py b/src/format_humann2_output.py index 91eb652..502d427 100644 --- a/src/format_humann2_output.py +++ b/src/format_humann2_output.py @@ -49,28 +49,41 @@ def extract_go_slim_annotations(args): def format_humann2_output(args, go_annotations): with open(args.humann2_output, "r") as humann2_output: - with open(args.formatted_humann2_output,"w") as formatted_humann2_output: - formatted_humann2_output.write("GO id\tGO name\tGO namespace\tAbundance\n") + output_files = {} + output_files['molecular_function'] = open(args.molecular_function_output_file,"w") + output_files['molecular_function'].write("GO id\tGO name\tAbundance\n") - for line in humann2_output.readlines()[1:]: - split_line = line[:-1].split('\t') - go_id = split_line[0] - abundance = split_line[1] + output_files['biological_process'] = open(args.biological_processes_output_file,"w") + output_files['biological_process'].write("GO id\tGO name\tAbundance\n") - if not go_annotations.has_key(go_id): - string = go_id + " has not found annotations" - raise ValueError(string) + output_files['cellular_component'] = open(args.cellular_component_output_file,"w") + output_files['cellular_component'].write("GO id\tGO name\tAbundance\n") - formatted_humann2_output.write(go_id + '\t') - formatted_humann2_output.write(go_annotations[go_id]["name"] + '\t') - formatted_humann2_output.write(go_annotations[go_id]["namespace"] + '\t') - formatted_humann2_output.write(abundance + '\n') + for line in humann2_output.readlines()[1:]: + split_line = line[:-1].split('\t') + go_id = split_line[0] + abundance = split_line[1] + namespace = go_annotations[go_id]["namespace"] + + if not go_annotations.has_key(go_id): + string = go_id + " has not found annotations" + raise ValueError(string) + + output_files[namespace].write(go_id + '\t') + output_files[namespace].write(go_annotations[go_id]["name"] + '\t') + output_files[namespace].write(abundance + '\n') + + output_files['molecular_function'].close() + output_files['biological_process'].close() + output_files['cellular_component'].close() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--go_slim', required=True) parser.add_argument('--humann2_output', required=True) - parser.add_argument('--formatted_humann2_output', required=True) + parser.add_argument('--molecular_function_output_file', required=True) + parser.add_argument('--biological_processes_output_file', required=True) + parser.add_argument('--cellular_component_output_file', required=True) args = parser.parse_args() go_annotations = extract_go_slim_annotations(args)