-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathgffintersect.pl
executable file
·224 lines (202 loc) · 9.51 KB
/
gffintersect.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/perl
$usage .= "$0 - find intersection of two GFF files\n";
$usage .= "\n";
$usage .= "Usage: $0 [-sortedname|-sortedstart|-unsorted] [-name1 name] [-quiet] [-print1] [-minfrac1 xxx] [-minfrac2 xxx] [-near xxx] [-minhits xxx] [-maxhits xxx] [-not] [-all] [-self] [-shutit] file1 file2\n";
$usage .= "\n";
$usage .= "Prints lines from file2 that intersect (or do not intersect) with file1\n";
$usage .= "\n";
$usage .= "Use -sortedname if file1 and file2 known to be sorted by <seqname> field,\n";
$usage .= " or -sortedstart (stronger, default) if also sorted in ascending order of <start> field,\n";
$usage .= " or -unsorted if not sorted at all.\n";
$usage .= "Use -name1 if file1 is a name that won't look good in \"intersect(..)=(....)\" messages (e.g. a complex pipe command)\n";
$usage .= "Use -quiet to suppress \"intersect(..)=(....)\" messages altogether\n";
$usage .= "Use -print1 to copy all lines from file1 to output\n";
$usage .= "Use -minfrac1 and -minfrac2 to specify minimum fractional overlap for file1 and file2 features respectively\n";
$usage .= "Use -near to extend definition of \"overlap\" to nearby GFFs\n";
$usage .= "Use -minhits and -maxhits to specify minimum/maximum number of hits required to print lines from file2 (default is 1)\n";
$usage .= "Use -not to print lines from file2 that don't match the criteria rather than lines that do\n";
$usage .= "Use -all to print everything from file2\n";
$usage .= "Use -self for comparisons vs self (omit file2); also adds \"self&\" prefix to name1\n";
$usage .= "Use -shutit to suppress GFF format warnings\n";
$usage .= "\n";
# By default assume files are sorted by name and sequence start number.
$sortedname = $sortedstart = 1;
$minhits = 1;
while (@ARGV) {
last unless $ARGV[0] =~ /^-./; # Loop thru all the command line options.
$opt = lc shift; # Get lowercase version of option.
if ($opt eq "-not") { $not = 1 }
elsif ($opt eq "-all") { $all = 1 }
elsif ($opt eq "-self") { $self = 1 }
elsif ($opt eq "-quiet") { $quiet = 1 }
elsif ($opt eq "-print1") { $print1 = 1 }
elsif ($opt eq "-sortedname") { $sortedname = 1; $sortedstart = 0 }
elsif ($opt eq "-sortedstart") { $sortedname = $sortedstart = 1 }
elsif ($opt eq "-unsorted") { $sortedname = $sortedstart = 0 }
elsif ($opt eq "-minfrac1") { defined($minfrac1 = shift) or die $usage }
elsif ($opt eq "-minfrac2") { defined($minfrac2 = shift) or die $usage }
elsif ($opt eq "-minhits") { defined($minhits = shift) or die $usage }
elsif ($opt eq "-maxhits") { defined($maxhits = shift) or die $usage }
elsif ($opt eq "-near") { defined($near = shift) or die $usage }
elsif ($opt eq "-name1") { defined($name1 = shift) or die $usage }
elsif ($opt eq "-shutit") { $shutit = 1 }
else { die "$usage\nUnknown option: $opt\n" }
}
if (@ARGV==0 && $self) { die "Sorry - perl doesn't buffer STDIN, so you'll have to use a temporary file for that. Died" }
if (@ARGV==1) { push @ARGV, $self ? $ARGV[0] : '-' }
@ARGV==2 or die $usage;
($file1,$file2) = @ARGV;
$name1 = $file1 unless defined $name1;
if ($self) { $name1 = "self&$name1" }
open FILE1, $file1 or die "$file1: $!";
open FILE2, $file2 or die "$file2: $!";
# $line1 holds next line from file1; @f1 holds fields of $line1
# $line2 holds next line from file2; @f2 holds fields of $line2
# *FILE1 is a typeglob used to pass the filehandle to subroutine.
# \$n1 is a reference to $n1, which is a line number counter.
($line1,@f1) = getline(*FILE1,$file1,\$n1);
($line2,@f2) = getline(*FILE2,$file2,\$n2);
# If files are sorted by name, store a reference to file1 array.
# Note: @file1 is an array and $file1 is a scalar. They are different variables.
if ($sortedname) { $aref = \@file1 }
while (defined $line2) { # outer loop is over all single successive lines in file2
$seqname2 = $f2[0]; # Name value from current line in file 2.
# printbuffer1 holds lines to be printed from file1. output these now, if it's time.
while (@printbuffer1) {
my @f = split /\t/, $printbuffer1[0];
# Print line from buffer1 if (name1 < name2) OR
# if (name1 == name2) AND ( (files are not sorted by start number) OR (start1 <= start2) )
if ($f[0] lt $seqname2 or
($f[0] eq $seqname2 and (!$sortedstart or $f[3] <= $f2[3]))) {
print shift @printbuffer1 # Print one line from buffer
}
else {
last
}
}
# If files are sorted by name AND name2 is new, reset the file1 array and store name2 in lastName2.
if ($sortedname && $seqname2 ne $lastseqname2) {
@file1 = ();
$lastseqname2 = $seqname2
}
# Get lines from file1 and push GFF coords onto file1 stack.
#
# If pre-sorted by name ($sortedname=1) and start point ($sortedstart=1), store in file1 array the gff coords where
# seqname1=seqname2 and there is overlap between file1 coords and file2 coords.
# If $sortedname=1 and $sortedstart==0, store in file1 array the gff coords where seqname1=seqname2 and do not check
# for overlap.
# If $sortedname=0, push all GFF coords into file1 hash.
#
# The coords on the @file1 (or @{$file1{$seqname1}}) stack are strings of the form "start end linenumber"
# ...this is so that they are fast to sort by scalar numeric value
#
while (defined($line1) and $seqname1 = $f1[0], # seqname1 is assigned here
(!$sortedname or $seqname1 lt $seqname2 or
($seqname1 eq $seqname2 and (!$sortedstart or $f1[3] - $near <= $f2[4])))) {
if ($sortedname) {
if ($seqname1 eq $seqname2) {
# In this context, file1 is an array.
push @file1, "@f1[3,4] $n1" # start, end, line number
}
}
else {
# In this context, file1 is a hash with key = seqname1 and value = anonymous array reference.
push @{$file1{$seqname1}}, "@f1[3,4] $n1";
}
$arefsorted = 0;
# put lines from file1 into file1 printing buffer if -print1 switch is set
if ($print1) {
if (!$sortedname or $seqname1 lt $seqname2 or ($seqname1 eq $seqname2 and (!$sortedstart or $f1[3] <= $f2[3]))) {
print $line1
}
else {
push @printbuffer1, $line1
}
}
($line1,@f1) = getline(*FILE1,$file1,\$n1);
}
# $aref is a reference to the array that holds the GFF coords for lines from file1.
# this array must be sorted by startpoint - do this now if necessary
#
unless ($sortedname) { $aref = $file1{$seqname2} } # file1 hash is accessed
unless ($arefsorted) {
unless ($sortedstart) { @$aref = sort {$a<=>$b} @$aref }
$arefsorted = 1;
}
# Loop over every entry in @$aref until past $line2 endpoint.
# $skip and $hitstart2yet are used to keep track of entries in @$aref that may be discarded:
# while the $hitstart2yet flag is FALSE, $skip is incremented and later used to discard passed $line1's
#
@lines = ();
$hitstart2yet = $skip = 0;
($start2,$end2) = @f2[3,4];
for ($i=0;$i<@$aref;$i++) {
$entry1 = $aref->[$i];
last if $entry1 - $near > $end2;
($start1,$end1,$n) = split /\s+/, $entry1; # parse the "start end linenumber" string
# check for entries that may be discarded
#
if ($end1 + $near < $start2) { # No overlap between line1 and line2
if ($sortedstart and !$hitstart2yet) { $skip = $i + 1 }
} else {
# if line from file1 overlaps line from file2, store linenumber in @lines array
#
$hitstart2yet = 1;
# the "-near" switch extends our concept of "overlap" to nearby GFFs
# to do this, we calculate effective start and end points for
($effstart1,$effend1) = ($start1-$near/2,$end1+$near/2);
($effstart2,$effend2) = ($start2-$near/2,$end2+$near/2);
$maxstart = $effstart1 > $effstart2 ? $effstart1 : $effstart2;
$minend = $effend1 < $effend2 ? $effend1 : $effend2;
$overlaplen = $minend + 1 - $maxstart;
$len1 = $effend1 + 1 - $effstart1;
$len2 = $effend2 + 1 - $effstart2;
if (($len1 && $overlaplen/$len1 >= $minfrac1) && ($len2 && $overlaplen/$len2 >= $minfrac2)) {
push @lines, $n; # Save line number from file 1
}
}
}
while ($skip-- > 0) { shift @$aref }
# print if eligible
#
if (defined $maxhits) {
$test = (@lines >= $minhits && @lines <= $maxhits)
}
else {
$test = (@lines >= $minhits)
}
# $all==1 means print all lines from file2
# $not==1 means print lines from file2 which don't match the criteria
if ($test && ($all || !$not)) {
if ($quiet) { print $line2 }
else {
chomp $line2;
print "$line2 intersect($name1)=(@{[sort {$a<=>$b} @lines]})\n";
}
} elsif (!$test && ($all || $not)) {
print $line2;
}
($line2,@f2) = getline(*FILE2,$file2,\$n2);
} # while (defined $line2)
print $line1 if $print1 && defined $line1;
close FILE2;
close FILE1;
sub getline {
local (*FH) = shift; # Make a local copy of the filehandle typeglob.
my ($file,$nref) = @_; # @_ contains the rest of the subroutine arguments: filename and a reference to a counter
my $line; # Initially is undefined.
# Iterate if $line is undefined or does not contain non-whitespace characters.
while (!defined($line) || $line !~ /\S/) {
$line = <FH>; # Get next line from the file.
$line =~ s/\#.*//; # Remove comments from $line.
++$$nref; # Increment line number counter.
last if !defined($line)
}
my @f = split(/\t/,$line,9); # split line into 9 tab separated fields
if (@f == 0) { return (undef) }
if (!$shutit) {
if (@f < 9) { warn "Warning: fewer than 9 fields at $file line $$nref\n" }
if (join("",@f[0..7]) =~ / /) { warn "Warning: space in tab-delimited field at $file line $$nref\n" }
}
($line,@f); # Return the line and the fields.
}