Split output file into 3 files (molecular, biological and cellular)

ASaiM · Feb 22, 2016 · 54e5cab · 54e5cab
1 parent 4828933
commit 54e5cab
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 22 deletions.
diff --git a/group_to_GO_abundances.sh b/group_to_GO_abundances.sh
@@ -14,7 +14,9 @@ function shortUsage() {
 
     echo "Required options:"
     prettyOpt "-i" "Path to file with UniRef50 gene family abundance (HUMAnN2 output)"
-    prettyOpt "-o" "Path to output file which will contain GO slim term abudances"
+    prettyOpt "-m" "Path to file which will contain GO slim term abudances corresponding to molecular functions"
+    prettyOpt "-b" "Path to file which will contain GO slim term abudances corresponding to biological processes"
+    prettyOpt "-c" "Path to file which will contain GO slim term abudances corresponding to cellular components"
     echo
     echo "Other options:"
     prettyOpt "-a" "Path to basic Gene Ontology file"
@@ -32,21 +34,29 @@ function shortUsage() {
 MY_PATH=`dirname "$0"`
 
 input_file=""
-output_file=""
+molecular_function_file=""
+biological_processes_file=""
+cellular_component_file=""
 go_file=$MY_PATH"/data/go.obo"
 slim_go_file=$MY_PATH"/data/goslim_metagenomics.obo"
 humann2_uniref_go=$MY_PATH"/data/map_infogo1000_uniref50.txt"
 goatools_path="goatools/scripts/"
 humann2_path="/usr/bin/"
 
 # Manage arguments
-while getopts ":i:o:a:s:u:g:p:h" opt; do
+while getopts ":i:m:b:c:a:s:u:g:p:h" opt; do
     case $opt in
         i)
             input_file=$OPTARG >&2
             ;;
-        o)
-            output_file=$OPTARG >&2
+        m)
+            molecular_function_file=$OPTARG >&2
+            ;;
+        b)
+            biological_processes_file=$OPTARG >&2
+            ;;
+        c)
+            cellular_component_file=$OPTARG >&2
             ;;
         a)
             go_file=$OPTARG >&2
@@ -84,8 +94,18 @@ if [ -z $input_file ]; then
     shortUsage;
 fi
 
-if [ -z $output_file ]; then
-    msg_error "Missing argument: -o"
+if [ -z $molecular_function_file ]; then
+    msg_error "Missing argument: -m"
+    shortUsage;
+fi
+
+if [ -z $biological_processes_file ]; then
+    msg_error "Missing argument: -b"
+    shortUsage;
+fi
+
+if [ -z $cellular_component_file ]; then
+    msg_error "Missing argument: -c"
     shortUsage;
 fi
 
@@ -144,7 +164,9 @@ echo "========================"
 python $MY_PATH"/src/format_humann2_output.py" \
     --go_slim $slim_go_file \
     --humann2_output $tmp_data_dir"/humann2_slim_go_abundances.txt" \
-    --formatted_humann2_output $output_file
+    --molecular_function_output_file $molecular_function_file \
+    --biological_processes_output_file $biological_processes_file \
+    --cellular_component_output_file $cellular_component_file
 echo ""
 
 #rm -rf $tmp_data_dir
diff --git a/src/format_humann2_output.py b/src/format_humann2_output.py
@@ -49,28 +49,41 @@ def extract_go_slim_annotations(args):
 
 def format_humann2_output(args, go_annotations):
     with open(args.humann2_output, "r") as humann2_output:
-        with open(args.formatted_humann2_output,"w") as formatted_humann2_output:
-            formatted_humann2_output.write("GO id\tGO name\tGO namespace\tAbundance\n")
+        output_files = {}
+        output_files['molecular_function'] = open(args.molecular_function_output_file,"w")
+        output_files['molecular_function'].write("GO id\tGO name\tAbundance\n")
 
-            for line in humann2_output.readlines()[1:]:
-                split_line = line[:-1].split('\t')
-                go_id = split_line[0]
-                abundance = split_line[1]
+        output_files['biological_process'] = open(args.biological_processes_output_file,"w")
+        output_files['biological_process'].write("GO id\tGO name\tAbundance\n")
 
-                if not go_annotations.has_key(go_id):
-                    string = go_id + " has not found annotations"
-                    raise ValueError(string)
+        output_files['cellular_component'] = open(args.cellular_component_output_file,"w")
+        output_files['cellular_component'].write("GO id\tGO name\tAbundance\n")
 
-                formatted_humann2_output.write(go_id + '\t')
-                formatted_humann2_output.write(go_annotations[go_id]["name"] + '\t')
-                formatted_humann2_output.write(go_annotations[go_id]["namespace"] + '\t')
-                formatted_humann2_output.write(abundance + '\n')
+        for line in humann2_output.readlines()[1:]:
+            split_line = line[:-1].split('\t')
+            go_id = split_line[0]
+            abundance = split_line[1]
+            namespace = go_annotations[go_id]["namespace"]
+
+            if not go_annotations.has_key(go_id):
+                string = go_id + " has not found annotations"
+                raise ValueError(string)
+
+            output_files[namespace].write(go_id + '\t')
+            output_files[namespace].write(go_annotations[go_id]["name"] + '\t')
+            output_files[namespace].write(abundance + '\n')
+
+        output_files['molecular_function'].close()
+        output_files['biological_process'].close()
+        output_files['cellular_component'].close()
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--go_slim', required=True)
     parser.add_argument('--humann2_output', required=True)
-    parser.add_argument('--formatted_humann2_output', required=True)
+    parser.add_argument('--molecular_function_output_file', required=True)
+    parser.add_argument('--biological_processes_output_file', required=True)
+    parser.add_argument('--cellular_component_output_file', required=True)
     args = parser.parse_args()
 
     go_annotations = extract_go_slim_annotations(args)