-
Notifications
You must be signed in to change notification settings - Fork 0
/
GET_TAXID_FROM_GI.pl
87 lines (73 loc) · 2.35 KB
/
GET_TAXID_FROM_GI.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/perl
use strict;
use warnings;
use feature qw(say);
use Getopt::Long;
use Pod::Usage;
my $file;
my $db;
my $usage = "\n$0 [options] \n
Options:
e.g. usage: perl $0 -file NP_001319.nr.blastp.xml.parsed.gi \
-db nr >NP_001319.nr.blastp.xml.parsed.gi.taxID.txt
-file File with gi numbers - one on each file
-db Blast database to use e.g. nr or nt
-help Show this message
";
GetOptions(
'file=s' =>\$file,
'db=s' =>\$db,
help => sub { pod2usage($usage) },
) or die($usage);
#set up options
unless ($file){
die "\nProvide a file with gi numbers - one on each line, -file <infile.txt>",
$usage;
}
unless ($db){
die "\nBlast database to use e.g. nr or nt, -db <nr>", $usage;
}
#check if values of option are provided. If not, die
my $inFH;
unless (open ($inFH, '<', $file)){
die "can't open $file", $! ;
}
say join("\t", "Taxon Id", "Common Name", "Scientific Name");
#print the headline
while(<$inFH>){
my $gi = $_;
if ($gi !~ /\d+/){
say STDERR "This is not a gi (", $gi, "), skipping\n";
next;
}
my ($taxon_id, $common_name,$scientific_name) = getTaxidFromGi($gi, $db);
say join ("\t", $taxon_id, $common_name, $scientific_name);
}
close $inFH;
#print the information I need
sub getTaxidFromGi{
my ($gi, $db) = @_;
chomp $gi;
my $cli = 'blastdbcmd -db ' . $db . ' -entry ' . $gi . ' -outfmt "NCBI Taxonomy id: %T; Common name: %L; Scientific name: %S" -target_only | ';
my $sysFH;
my ($taxon_id, $common_name,$scientific_name) = ("NA", "NA", "NA");
unless (open($sysFH, $cli)){
die "Can't open the system call ", $cli, "\n";
}
while(<$sysFH>){
chomp;
if ($_ =~ /NCBI Taxonomy ID:\s+(\d+)/i){
$taxon_id = $1;
}
if ($_ =~ /Common Name:\s+(.*?);/i){
$common_name = $1;
}
if ($_ =~ /Scientific Name:\s(.*)/i){
$scientific_name = $1;
}
my ($taxon_id, $common_name, $scientific_name);
}
close $sysFH;
return ($taxon_id, $common_name, $scientific_name);
}
#get taxon ID, common name and scientific name