Skip to content

Commit

Permalink
Split output file into 3 files (molecular, biological and cellular)
Browse files Browse the repository at this point in the history
  • Loading branch information
bebatut committed Feb 22, 2016
1 parent 4828933 commit 54e5cab
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 22 deletions.
38 changes: 30 additions & 8 deletions group_to_GO_abundances.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ function shortUsage() {

echo "Required options:"
prettyOpt "-i" "Path to file with UniRef50 gene family abundance (HUMAnN2 output)"
prettyOpt "-o" "Path to output file which will contain GO slim term abudances"
prettyOpt "-m" "Path to file which will contain GO slim term abudances corresponding to molecular functions"
prettyOpt "-b" "Path to file which will contain GO slim term abudances corresponding to biological processes"
prettyOpt "-c" "Path to file which will contain GO slim term abudances corresponding to cellular components"
echo
echo "Other options:"
prettyOpt "-a" "Path to basic Gene Ontology file"
Expand All @@ -32,21 +34,29 @@ function shortUsage() {
MY_PATH=`dirname "$0"`

input_file=""
output_file=""
molecular_function_file=""
biological_processes_file=""
cellular_component_file=""
go_file=$MY_PATH"/data/go.obo"
slim_go_file=$MY_PATH"/data/goslim_metagenomics.obo"
humann2_uniref_go=$MY_PATH"/data/map_infogo1000_uniref50.txt"
goatools_path="goatools/scripts/"
humann2_path="/usr/bin/"

# Manage arguments
while getopts ":i:o:a:s:u:g:p:h" opt; do
while getopts ":i:m:b:c:a:s:u:g:p:h" opt; do
case $opt in
i)
input_file=$OPTARG >&2
;;
o)
output_file=$OPTARG >&2
m)
molecular_function_file=$OPTARG >&2
;;
b)
biological_processes_file=$OPTARG >&2
;;
c)
cellular_component_file=$OPTARG >&2
;;
a)
go_file=$OPTARG >&2
Expand Down Expand Up @@ -84,8 +94,18 @@ if [ -z $input_file ]; then
shortUsage;
fi

if [ -z $output_file ]; then
msg_error "Missing argument: -o"
if [ -z $molecular_function_file ]; then
msg_error "Missing argument: -m"
shortUsage;
fi

if [ -z $biological_processes_file ]; then
msg_error "Missing argument: -b"
shortUsage;
fi

if [ -z $cellular_component_file ]; then
msg_error "Missing argument: -c"
shortUsage;
fi

Expand Down Expand Up @@ -144,7 +164,9 @@ echo "========================"
python $MY_PATH"/src/format_humann2_output.py" \
--go_slim $slim_go_file \
--humann2_output $tmp_data_dir"/humann2_slim_go_abundances.txt" \
--formatted_humann2_output $output_file
--molecular_function_output_file $molecular_function_file \
--biological_processes_output_file $biological_processes_file \
--cellular_component_output_file $cellular_component_file
echo ""

#rm -rf $tmp_data_dir
41 changes: 27 additions & 14 deletions src/format_humann2_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,28 +49,41 @@ def extract_go_slim_annotations(args):

def format_humann2_output(args, go_annotations):
with open(args.humann2_output, "r") as humann2_output:
with open(args.formatted_humann2_output,"w") as formatted_humann2_output:
formatted_humann2_output.write("GO id\tGO name\tGO namespace\tAbundance\n")
output_files = {}
output_files['molecular_function'] = open(args.molecular_function_output_file,"w")
output_files['molecular_function'].write("GO id\tGO name\tAbundance\n")

for line in humann2_output.readlines()[1:]:
split_line = line[:-1].split('\t')
go_id = split_line[0]
abundance = split_line[1]
output_files['biological_process'] = open(args.biological_processes_output_file,"w")
output_files['biological_process'].write("GO id\tGO name\tAbundance\n")

if not go_annotations.has_key(go_id):
string = go_id + " has not found annotations"
raise ValueError(string)
output_files['cellular_component'] = open(args.cellular_component_output_file,"w")
output_files['cellular_component'].write("GO id\tGO name\tAbundance\n")

formatted_humann2_output.write(go_id + '\t')
formatted_humann2_output.write(go_annotations[go_id]["name"] + '\t')
formatted_humann2_output.write(go_annotations[go_id]["namespace"] + '\t')
formatted_humann2_output.write(abundance + '\n')
for line in humann2_output.readlines()[1:]:
split_line = line[:-1].split('\t')
go_id = split_line[0]
abundance = split_line[1]
namespace = go_annotations[go_id]["namespace"]

if not go_annotations.has_key(go_id):
string = go_id + " has not found annotations"
raise ValueError(string)

output_files[namespace].write(go_id + '\t')
output_files[namespace].write(go_annotations[go_id]["name"] + '\t')
output_files[namespace].write(abundance + '\n')

output_files['molecular_function'].close()
output_files['biological_process'].close()
output_files['cellular_component'].close()

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--go_slim', required=True)
parser.add_argument('--humann2_output', required=True)
parser.add_argument('--formatted_humann2_output', required=True)
parser.add_argument('--molecular_function_output_file', required=True)
parser.add_argument('--biological_processes_output_file', required=True)
parser.add_argument('--cellular_component_output_file', required=True)
args = parser.parse_args()

go_annotations = extract_go_slim_annotations(args)
Expand Down

0 comments on commit 54e5cab

Please sign in to comment.