From 1c94abd2f59a94bad58b09b1d51e11f467286c1b Mon Sep 17 00:00:00 2001 From: jraysajulga Date: Mon, 22 Apr 2019 11:06:32 -0500 Subject: [PATCH 1/6] add taxon filter --- src/format_humann2_output.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/format_humann2_output.py b/src/format_humann2_output.py index 0eeefd7..720ca28 100644 --- a/src/format_humann2_output.py +++ b/src/format_humann2_output.py @@ -64,7 +64,10 @@ def format_humann2_output(args, go_annotations): go_id = split_line[0] abundance = split_line[1] - if go_id == "UNGROUPED": + if '|' in go_id: + go_id = go_id.split('|')[0]; + + if go_id == "UNGROUPED" or go_id == "UNMAPPED": continue namespace = go_annotations[go_id]["namespace"] From d36fa7152ff4579722b615367ab20ddff0a434cb Mon Sep 17 00:00:00 2001 From: jraysajulga Date: Tue, 23 Apr 2019 14:22:02 -0500 Subject: [PATCH 2/6] add support for extra identifiers e.g., taxon --- src/format_humann2_output.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/format_humann2_output.py b/src/format_humann2_output.py index 720ca28..0d2ffe4 100644 --- a/src/format_humann2_output.py +++ b/src/format_humann2_output.py @@ -64,8 +64,10 @@ def format_humann2_output(args, go_annotations): go_id = split_line[0] abundance = split_line[1] + extra_id = '' if '|' in go_id: - go_id = go_id.split('|')[0]; + go_id, extra_id = go_id.split('|') + extra_id = "|" + extra_id if go_id == "UNGROUPED" or go_id == "UNMAPPED": continue @@ -77,7 +79,7 @@ def format_humann2_output(args, go_annotations): raise ValueError(string) output_files[namespace].write(go_id + '\t') - output_files[namespace].write(go_annotations[go_id]["name"] + '\t') + output_files[namespace].write(go_annotations[go_id]["name"] + extra_id + '\t') output_files[namespace].write(abundance + '\n') output_files['molecular_function'].close() From 90ceba6482fa08b4fbe6867d71e3830769132ed9 Mon Sep 17 00:00:00 2001 From: jraysajulga Date: Tue, 23 Apr 2019 14:38:08 -0500 Subject: [PATCH 3/6] add misc. column --- src/format_humann2_output.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/format_humann2_output.py b/src/format_humann2_output.py index 0d2ffe4..3008ada 100644 --- a/src/format_humann2_output.py +++ b/src/format_humann2_output.py @@ -51,13 +51,13 @@ def format_humann2_output(args, go_annotations): with open(args.humann2_output, "r") as humann2_output: output_files = {} output_files['molecular_function'] = open(args.molecular_function_output_file,"w") - output_files['molecular_function'].write("GO id\tGO name\tAbundance\n") + output_files['molecular_function'].write("GO id\tGO name\tAbundance\tMisc.\n") output_files['biological_process'] = open(args.biological_processes_output_file,"w") - output_files['biological_process'].write("GO id\tGO name\tAbundance\n") + output_files['biological_process'].write("GO id\tGO name\tAbundance\tMisc.\n") output_files['cellular_component'] = open(args.cellular_component_output_file,"w") - output_files['cellular_component'].write("GO id\tGO name\tAbundance\n") + output_files['cellular_component'].write("GO id\tGO name\tAbundance\tMisc.\n") for line in humann2_output.readlines()[1:]: split_line = line[:-1].split('\t') @@ -67,20 +67,19 @@ def format_humann2_output(args, go_annotations): extra_id = '' if '|' in go_id: go_id, extra_id = go_id.split('|') - extra_id = "|" + extra_id - + if go_id == "UNGROUPED" or go_id == "UNMAPPED": continue namespace = go_annotations[go_id]["namespace"] - - if not go_annotations.has_key(go_id): + if not go_id in go_annotations: string = go_id + " has not found annotations" raise ValueError(string) output_files[namespace].write(go_id + '\t') - output_files[namespace].write(go_annotations[go_id]["name"] + extra_id + '\t') - output_files[namespace].write(abundance + '\n') + output_files[namespace].write(go_annotations[go_id]["name"] + '\t') + output_files[namespace].write(abundance + '\t') + output_files[namespace].write(extra_id + '\n') output_files['molecular_function'].close() output_files['biological_process'].close() From 50615e9e597b52364d5b0707c8f926ff3792b98e Mon Sep 17 00:00:00 2001 From: jraysajulga Date: Wed, 24 Apr 2019 15:19:06 -0500 Subject: [PATCH 4/6] implement genus and species columns (implemented when taxIDs detected) --- src/format_humann2_output.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/format_humann2_output.py b/src/format_humann2_output.py index 3008ada..6aa3dd0 100644 --- a/src/format_humann2_output.py +++ b/src/format_humann2_output.py @@ -50,23 +50,35 @@ def extract_go_slim_annotations(args): def format_humann2_output(args, go_annotations): with open(args.humann2_output, "r") as humann2_output: output_files = {} + + # Check if any tax IDs are present then set up the appropriate header extension + tax_header = '' + if any(re.match('.+\|g__.+\.s__.+', line) for line in humann2_output): + tax_header = '\tGenus\tSpecies' + output_files['molecular_function'] = open(args.molecular_function_output_file,"w") - output_files['molecular_function'].write("GO id\tGO name\tAbundance\tMisc.\n") + output_files['molecular_function'].write("GO id\tGO name\tAbundance" + tax_header + "\n") output_files['biological_process'] = open(args.biological_processes_output_file,"w") - output_files['biological_process'].write("GO id\tGO name\tAbundance\tMisc.\n") + output_files['biological_process'].write("GO id\tGO name\tAbundance" + tax_header + "\n") output_files['cellular_component'] = open(args.cellular_component_output_file,"w") - output_files['cellular_component'].write("GO id\tGO name\tAbundance\tMisc.\n") + output_files['cellular_component'].write("GO id\tGO name\tAbundance" + tax_header + "\n") + + # Reset to beginning of file and skip header + humann2_output.seek(0) + next(humann2_output) - for line in humann2_output.readlines()[1:]: + for line in humann2_output: split_line = line[:-1].split('\t') go_id = split_line[0] abundance = split_line[1] - extra_id = '' + genus = species = '' if '|' in go_id: go_id, extra_id = go_id.split('|') + if re.match('g__.+\.s__.+', extra_id): + genus, species = re.match('g__(.+)\.s__(.+)', extra_id).groups() if go_id == "UNGROUPED" or go_id == "UNMAPPED": continue @@ -78,8 +90,12 @@ def format_humann2_output(args, go_annotations): output_files[namespace].write(go_id + '\t') output_files[namespace].write(go_annotations[go_id]["name"] + '\t') - output_files[namespace].write(abundance + '\t') - output_files[namespace].write(extra_id + '\n') + if tax_header == '': + output_files[namespace].write(abundance + '\n') + else: + output_files[namespace].write(abundance + '\t') + output_files[namespace].write(genus.replace('_',' ') + '\t') + output_files[namespace].write(species.replace('_',' ') + '\n') output_files['molecular_function'].close() output_files['biological_process'].close() From fe01ce07f0ddee2e5aa66423a0100773d82dfa50 Mon Sep 17 00:00:00 2001 From: jraysajulga Date: Wed, 24 Apr 2019 15:31:04 -0500 Subject: [PATCH 5/6] transfer |species.genus IDs to GO and namespace columns --- src/format_humann2_output.py | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/src/format_humann2_output.py b/src/format_humann2_output.py index 6aa3dd0..2687e9f 100644 --- a/src/format_humann2_output.py +++ b/src/format_humann2_output.py @@ -51,34 +51,26 @@ def format_humann2_output(args, go_annotations): with open(args.humann2_output, "r") as humann2_output: output_files = {} - # Check if any tax IDs are present then set up the appropriate header extension - tax_header = '' - if any(re.match('.+\|g__.+\.s__.+', line) for line in humann2_output): - tax_header = '\tGenus\tSpecies' - output_files['molecular_function'] = open(args.molecular_function_output_file,"w") - output_files['molecular_function'].write("GO id\tGO name\tAbundance" + tax_header + "\n") + output_files['molecular_function'].write("GO id\tGO name\tAbundance\n") output_files['biological_process'] = open(args.biological_processes_output_file,"w") - output_files['biological_process'].write("GO id\tGO name\tAbundance" + tax_header + "\n") + output_files['biological_process'].write("GO id\tGO name\tAbundance\n") output_files['cellular_component'] = open(args.cellular_component_output_file,"w") - output_files['cellular_component'].write("GO id\tGO name\tAbundance" + tax_header + "\n") + output_files['cellular_component'].write("GO id\tGO name\tAbundance\n") - # Reset to beginning of file and skip header - humann2_output.seek(0) + # skip header next(humann2_output) - for line in humann2_output: split_line = line[:-1].split('\t') go_id = split_line[0] abundance = split_line[1] - genus = species = '' + extra_id = '' if '|' in go_id: go_id, extra_id = go_id.split('|') - if re.match('g__.+\.s__.+', extra_id): - genus, species = re.match('g__(.+)\.s__(.+)', extra_id).groups() + extra_id = '|' + extra_id if go_id == "UNGROUPED" or go_id == "UNMAPPED": continue @@ -88,14 +80,9 @@ def format_humann2_output(args, go_annotations): string = go_id + " has not found annotations" raise ValueError(string) - output_files[namespace].write(go_id + '\t') - output_files[namespace].write(go_annotations[go_id]["name"] + '\t') - if tax_header == '': - output_files[namespace].write(abundance + '\n') - else: - output_files[namespace].write(abundance + '\t') - output_files[namespace].write(genus.replace('_',' ') + '\t') - output_files[namespace].write(species.replace('_',' ') + '\n') + output_files[namespace].write(go_id + extra_id + '\t') + output_files[namespace].write(go_annotations[go_id]["name"] + extra_id + '\t') + output_files[namespace].write(abundance + '\n') output_files['molecular_function'].close() output_files['biological_process'].close() From 0ead6a982a38a3029764c903405230030da56b2c Mon Sep 17 00:00:00 2001 From: jraysajulga Date: Wed, 24 Apr 2019 15:36:36 -0500 Subject: [PATCH 6/6] update expected test outputs to truncated numbers from humann2 v11 --- .../expected_biological_process_abundance.txt | 94 +++++++++---------- .../expected_cellular_component_abundance.txt | 50 +++++----- ...expected_molecular_function_abundances.txt | 80 ++++++++-------- 3 files changed, 112 insertions(+), 112 deletions(-) diff --git a/test-data/expected_biological_process_abundance.txt b/test-data/expected_biological_process_abundance.txt index da29959..d244c0c 100644 --- a/test-data/expected_biological_process_abundance.txt +++ b/test-data/expected_biological_process_abundance.txt @@ -1,47 +1,47 @@ -GO id GO name Abundance -GO:0000160 phosphorelay signal transduction system 1112.3915515778 -GO:0000746 conjugation 46.559859807600006 -GO:0000902 cell morphogenesis 1.8470267351 -GO:0002376 immune system process 23.700662623200003 -GO:0005975 carbohydrate metabolic process 2612.6970692966993 -GO:0006091 generation of precursor metabolites and energy 943.0642504650992 -GO:0006259 DNA metabolic process 2760.3666308885995 -GO:0006351 transcription, DNA-templated 0.0615114205 -GO:0006412 translation 3280.1516158372006 -GO:0006520 cellular amino acid metabolic process 3953.6382689974002 -GO:0006629 lipid metabolic process 1380.8390328801004 -GO:0006790 sulfur compound metabolic process 1012.3221601218003 -GO:0006807 nitrogen compound metabolic process 3915.4792502578994 -GO:0006810 transport 1101.0465624405 -GO:0006811 ion transport 1276.2633195375001 -GO:0006950 response to stress 954.8641672250999 -GO:0007154 cell communication 316.8910362473 -GO:0007165 signal transduction 45.3316263193 -GO:0008150 biological_process 2784.3062048235997 -GO:0009056 catabolic process 1728.292233976299 -GO:0009058 biosynthetic process 12048.508539055703 -GO:0009117 nucleotide metabolic process 2472.3435074350996 -GO:0009306 protein secretion 23.141466690899996 -GO:0009404 toxin metabolic process 9.095920561400002 -GO:0009405 pathogenesis 58.27509464559998 -GO:0015979 photosynthesis 44.73480409939999 -GO:0016032 viral process 0.7986287266 -GO:0016043 cellular component organization 1246.5641915709 -GO:0016049 cell growth 4.1997618312 -GO:0016070 RNA metabolic process 1855.2220049666994 -GO:0016310 phosphorylation 1372.0241480503992 -GO:0019538 protein metabolic process 672.0486860101004 -GO:0022900 electron transport chain 103.16706894990003 -GO:0030031 cell projection assembly 32.495419504000004 -GO:0032196 transposition 300.7048225378001 -GO:0042592 homeostatic process 306.4798266318999 -GO:0043934 sporulation 174.71346946979997 -GO:0044281 small molecule metabolic process 4445.203127840403 -GO:0045333 cellular respiration 5.518723520100001 -GO:0046034 ATP metabolic process 1002.5305430942991 -GO:0048870 cell motility 107.9960707376 -GO:0051186 cofactor metabolic process 2456.361318531299 -GO:0051301 cell division 864.5391215972995 -GO:0055114 oxidation-reduction process 215.51031499180004 -GO:0071554 cell wall organization or biogenesis 36.7500063207 -GO:0071941 nitrogen cycle metabolic process 56.684053508 +GO id GO name Abundance +GO:0000160 phosphorelay signal transduction system 1112.392 +GO:0000746 conjugation 46.56 +GO:0000902 cell morphogenesis 1.847 +GO:0002376 immune system process 23.701 +GO:0005975 carbohydrate metabolic process 2612.697 +GO:0006091 generation of precursor metabolites and energy 943.064 +GO:0006259 DNA metabolic process 2760.367 +GO:0006351 transcription, DNA-templated 0.062 +GO:0006412 translation 3280.152 +GO:0006520 cellular amino acid metabolic process 3953.638 +GO:0006629 lipid metabolic process 1380.837 +GO:0006790 sulfur compound metabolic process 1012.321 +GO:0006807 nitrogen compound metabolic process 3915.478 +GO:0006810 transport 1101.046 +GO:0006811 ion transport 1276.261 +GO:0006950 response to stress 954.866 +GO:0007154 cell communication 316.891 +GO:0007165 signal transduction 45.33 +GO:0008150 biological_process 2784.3 +GO:0009056 catabolic process 1728.292 +GO:0009058 biosynthetic process 12048.505 +GO:0009117 nucleotide metabolic process 2472.344 +GO:0009306 protein secretion 23.141 +GO:0009404 toxin metabolic process 9.096 +GO:0009405 pathogenesis 58.275 +GO:0015979 photosynthesis 44.735 +GO:0016032 viral process 0.799 +GO:0016043 cellular component organization 1246.564 +GO:0016049 cell growth 4.2 +GO:0016070 RNA metabolic process 1855.223 +GO:0016310 phosphorylation 1372.024 +GO:0019538 protein metabolic process 672.049 +GO:0022900 electron transport chain 103.167 +GO:0030031 cell projection assembly 32.495 +GO:0032196 transposition 300.705 +GO:0042592 homeostatic process 306.48 +GO:0043934 sporulation 174.713 +GO:0044281 small molecule metabolic process 4445.202 +GO:0045333 cellular respiration 5.519 +GO:0046034 ATP metabolic process 1002.531 +GO:0048870 cell motility 107.996 +GO:0051186 cofactor metabolic process 2456.361 +GO:0051301 cell division 864.539 +GO:0055114 oxidation-reduction process 215.51 +GO:0071554 cell wall organization or biogenesis 36.75 +GO:0071941 nitrogen cycle metabolic process 56.684 diff --git a/test-data/expected_cellular_component_abundance.txt b/test-data/expected_cellular_component_abundance.txt index 3df617f..7582e9a 100644 --- a/test-data/expected_cellular_component_abundance.txt +++ b/test-data/expected_cellular_component_abundance.txt @@ -1,25 +1,25 @@ -GO id GO name Abundance -GO:0005575 cellular_component 4211.620011669099 -GO:0005618 cell wall 101.24244809220001 -GO:0005634 nucleus 86.82707381589996 -GO:0005694 chromosome 0.0271872006 -GO:0005727 extrachromosomal circular DNA 121.82450720269999 -GO:0005737 cytoplasm 9919.697399687224 -GO:0005739 mitochondrion 193.03667393669969 -GO:0005773 vacuole 2.2073187229999993 -GO:0005783 endoplasmic reticulum 13.858678920100003 -GO:0005794 Golgi apparatus 11.580712904300004 -GO:0005840 ribosome 1635.3364950416994 -GO:0005856 cytoskeleton 0.1781304753 -GO:0005886 plasma membrane 2956.1483673840053 -GO:0009295 nucleoid 56.116239729200025 -GO:0009579 thylakoid 16.531472969800003 -GO:0016020 membrane 76.73590696070002 -GO:0019012 virion 0.2407286574 -GO:0019867 outer membrane 502.1513331402007 -GO:0031012 extracellular matrix 0.5991089999 -GO:0031982 vesicle 81.09742051820007 -GO:0042575 DNA polymerase complex 65.46696539999999 -GO:0042597 periplasmic space 130.1736765559 -GO:0043190 ATP-binding cassette (ABC) transporter complex 271.6521785019997 -GO:0070469 respiratory chain 2.0447839356 +GO id GO name Abundance +GO:0005575 cellular_component 4211.617 +GO:0005618 cell wall 101.242 +GO:0005634 nucleus 86.827 +GO:0005694 chromosome 0.027 +GO:0005727 extrachromosomal circular DNA 121.825 +GO:0005737 cytoplasm 9919.697 +GO:0005739 mitochondrion 193.037 +GO:0005773 vacuole 2.207 +GO:0005783 endoplasmic reticulum 13.859 +GO:0005794 Golgi apparatus 11.581 +GO:0005840 ribosome 1635.336 +GO:0005856 cytoskeleton 0.178 +GO:0005886 plasma membrane 2956.148 +GO:0009295 nucleoid 56.116 +GO:0009579 thylakoid 16.531 +GO:0016020 membrane 76.737 +GO:0019012 virion 0.241 +GO:0019867 outer membrane 502.152 +GO:0031012 extracellular matrix 0.599 +GO:0031982 vesicle 81.097 +GO:0042575 DNA polymerase complex 65.467 +GO:0042597 periplasmic space 130.174 +GO:0043190 ATP-binding cassette (ABC) transporter complex 271.652 +GO:0070469 respiratory chain 2.045 diff --git a/test-data/expected_molecular_function_abundances.txt b/test-data/expected_molecular_function_abundances.txt index 063e528..7a204db 100644 --- a/test-data/expected_molecular_function_abundances.txt +++ b/test-data/expected_molecular_function_abundances.txt @@ -1,40 +1,40 @@ -GO id GO name Abundance -GO:0000156 phosphorelay response regulator activity 4.4347360838 -GO:0000166 nucleotide binding 11436.730254302318 -GO:0000988 transcription factor activity, protein binding 673.7839975407998 -GO:0003674 molecular_function 4087.6458012782 -GO:0003676 nucleic acid binding 3028.700666733601 -GO:0003924 GTPase activity 490.16358835940014 -GO:0004386 helicase activity 273.26291990489995 -GO:0004518 nuclease activity 708.2423017374001 -GO:0004672 protein kinase activity 626.4942495674997 -GO:0004812 aminoacyl-tRNA ligase activity 234.25362423819996 -GO:0004872 receptor activity 626.4846539767997 -GO:0005102 receptor binding 1.5301407557 -GO:0005215 transporter activity 1762.0709654024004 -GO:0008092 cytoskeletal protein binding 7.383357547800001 -GO:0008134 transcription factor binding 84.63172726779997 -GO:0008233 peptidase activity 1182.3108289962993 -GO:0016298 lipase activity 3.6081972003000002 -GO:0016301 kinase activity 937.2682550285 -GO:0016407 acetyltransferase activity 65.750125095 -GO:0016491 oxidoreductase activity 2051.3275224607996 -GO:0016740 transferase activity 2598.2115822036 -GO:0016746 transferase activity, transferring acyl groups 212.45457835869996 -GO:0016757 transferase activity, transferring glycosyl groups 319.0056774039 -GO:0016765 transferase activity, transferring alkyl or aryl (other than methyl) groups 161.48389564689995 -GO:0016787 hydrolase activity 280.95384492459993 -GO:0016788 hydrolase activity, acting on ester bonds 549.7984070913 -GO:0016791 phosphatase activity 148.29697741039996 -GO:0016798 hydrolase activity, acting on glycosyl bonds 650.9218081131 -GO:0016810 hydrolase activity, acting on carbon-nitrogen (but not peptide) bonds 428.24025912369996 -GO:0016829 lyase activity 575.0019956493999 -GO:0016853 isomerase activity 1916.3316482209 -GO:0016874 ligase activity 675.5332395482 -GO:0016887 ATPase activity 965.0511058454001 -GO:0030234 enzyme regulator activity 31.695909846600003 -GO:0034061 DNA polymerase activity 290.6773463926 -GO:0034062 RNA polymerase activity 233.7752678209 -GO:0046872 metal ion binding 4116.705081461401 -GO:0048037 cofactor binding 2930.3128404498993 -GO:0051082 unfolded protein binding 203.18389855299998 +GO id GO name Abundance +GO:0000156 phosphorelay response regulator activity 4.435 +GO:0000166 nucleotide binding 11436.73 +GO:0000988 transcription factor activity, protein binding 673.784 +GO:0003674 molecular_function 4087.644 +GO:0003676 nucleic acid binding 3028.702 +GO:0003924 GTPase activity 490.164 +GO:0004386 helicase activity 273.263 +GO:0004518 nuclease activity 708.243 +GO:0004672 protein kinase activity 626.495 +GO:0004812 aminoacyl-tRNA ligase activity 234.254 +GO:0004872 receptor activity 626.485 +GO:0005102 receptor binding 1.53 +GO:0005215 transporter activity 1762.068 +GO:0008092 cytoskeletal protein binding 7.383 +GO:0008134 transcription factor binding 84.632 +GO:0008233 peptidase activity 1182.31 +GO:0016298 lipase activity 3.608 +GO:0016301 kinase activity 937.268 +GO:0016407 acetyltransferase activity 65.75 +GO:0016491 oxidoreductase activity 2051.329 +GO:0016740 transferase activity 2598.211 +GO:0016746 transferase activity, transferring acyl groups 212.455 +GO:0016757 transferase activity, transferring glycosyl groups 319.006 +GO:0016765 transferase activity, transferring alkyl or aryl (other than methyl) groups 161.484 +GO:0016787 hydrolase activity 280.954 +GO:0016788 hydrolase activity, acting on ester bonds 549.799 +GO:0016791 phosphatase activity 148.298 +GO:0016798 hydrolase activity, acting on glycosyl bonds 650.921 +GO:0016810 hydrolase activity, acting on carbon-nitrogen (but not peptide) bonds 428.241 +GO:0016829 lyase activity 575.003 +GO:0016853 isomerase activity 1916.331 +GO:0016874 ligase activity 675.532 +GO:0016887 ATPase activity 965.05 +GO:0030234 enzyme regulator activity 31.696 +GO:0034061 DNA polymerase activity 290.677 +GO:0034062 RNA polymerase activity 233.775 +GO:0046872 metal ion binding 4116.705 +GO:0048037 cofactor binding 2930.313 +GO:0051082 unfolded protein binding 203.184