blast2gff_2ndFile.pl

#!/usr/bin/perl
# blast92gff3.pl

=head1 NAME

  blast92gff3.pl
  unpacked that is " BLAST tabular output (-m 9 or 8) conversion to GFF version 3 format "

=head1 SYNOPSIS

  ncbi/blastall -m 9 -p tblastn -i daphnia_genes.aa -d aphid-genome -o aphid-daphnia_genes.tblastn
  cat aphid-daphnia_genes.tblastn | sort -k1,1 | blast92gff2.pl > aphid-daphnia_genes.gff

  Note: input must be sorted by query ID.  Default .tblastn result is sorted by query.

=head2 BlastX and tBlastN

  This should work the same for BlastX, if you sort on column 2 and use -swap:
     blastall -p blastx -d genes.aa -i genome  | sort -k2,2 | blast92gff2.pl -swap
  It should work for DNA queries like EST/cDNA mapped to a genome also.

=head1 ABOUT

  Separate BLAST tabular output (-m 9 or 8) into gene models (match,HSP) by parsing distinct,
  duplicate gene matches using the query locations as well as source locations.

  This works on one query gene at a time.  The result likely will have many overlapped
  different genes of declining quality.  You can filter those in a 2nd step:
   $td/overbestgene2.perl -in aphid-daphnia_genes.gff > aphid-daphnia_genes-best.gff

  Learn more on the subject of accurately locating duplicate genes at
  http://eugenes.org/gmod/tandy/

  See this note on how BioPerl combines distinct gene matches from blat, blast in its
  searchio module
  http://www.bioperl.org/wiki/Talk:GFF_code_audit


=head2 TEST data

  lots of "insect" cuticle tandem duplicate genes in
  daphnia_pulex/scaffold_14:1294000..1322000

  http://insects.eugenes.org/cgi-bin/gbrowsenew/gbrowse/daphnia_pulex/?name=scaffold_14:1294000..1322000

  Pick the protein of one cuticle gene,  daphnia:NCBI_GNO_546144
  Run tblastn at wfleabase.org of NCBI_GNO_546144 protein x genome, saving tabular blast result
  blast92gff2.pl < result.tblastn

=head1 AUTHOR

  Don Gilbert, gilbertd@indiana.edu  June 2008

=head1 NOTES

=cut


use strict;

use constant MAX_EXON_SEPARATION => 10000; # 50000;  # make an option
use constant QBASEOVER => 5; # query hsp overlap slop in bases
# use constant GFF_LOC => 1;

my $OVERLAP_SLOP_QUERY  = 0.15; # was 0.15; for protein query HSPs
my $OVERLAP_SLOP_GENOME = 0.05; # was 0.50; for genome source HSPs
my $BIN10000= 10000; # binsize
my $GAPSIZE = 400; # what ?? for nearover1() test

my $LOWSCORE_SKIP  = 0.20; # i.e. skip matchs < 50% of max match score

use Getopt::Long;

use vars qw(
$swap_querytarget
$faprefix $debug
$querySource $exonType
$geneType  $dotarget
$max_bit_score
$bitcutoff $stringent2ndary
%besthash %sumhash @qparthsps @sourcehsps
@mainexons @secondexons @allsaved %moregenes $species $queryid $npart
%shredhash %tandhash $tophsp $lqid $lsid $nwarn $outfile
$gffout %didid
);


my $addgene= 1;
$geneType="match"; # or "protein_match" ...;
$exonType="HSP"; # or "match_part";
$querySource="blast";
$faprefix= 'gnl\|'; # drop NCBI extra id prefix
$stringent2ndary=1;
$dotarget= 1;

#  do own sorting for score, loc : still need queryID sort ...
#  ** WARNING : input blast must be sorted by queryID, and best score (sort -k1,1 -k12,12nr) **
#  cat  modelproteins-mygenome.tblastn| sort -k1,1 -k12,12nr |\

my $USAGE =<<"USAGE";
   ** WARNING : input blast must be sorted by queryID  (sort -k1,1 ) **
 cat  modelproteins-mygenome.tblastn| sort -k1,1 |\
      blast92gff3.pl [options]  > modelproteins-mygenome.gff
 options: -swap : swap query, target
 -source $querySource : gff.source field
 -exonType $exonType , -geneType $geneType  : gff feature types
 -lowscore $LOWSCORE_SKIP : skip matches < $LOWSCORE_SKIP of max score
 -notarget, -nomatch  :  change gff output

USAGE

my $ok=&GetOptions(
  "output=s" => \$outfile,
  "qoverlap=s" => \$OVERLAP_SLOP_QUERY,
  "overlap=s" => \$OVERLAP_SLOP_GENOME,
  "LOWSCORE_SKIP=s" => \$LOWSCORE_SKIP,
  "source=s", \$querySource, # also == GFF source ?
  "geneType=s", \$geneType,
  "exonType=s", \$exonType,
  "swap_querytarget!" => \$swap_querytarget,
  'stringent2ndary!' => \$stringent2ndary,
  "faprefix=s", \$faprefix,
  "target!" => \$dotarget,
  "match!" => \$addgene,
  "debug!" => \$debug,
  );
die $USAGE unless($ok);


sub reset_target_vars {
  printGeneLocations(@_);

  $npart= 0;
  $max_bit_score= 0;
  $bitcutoff=0;
  %shredhash=();
  %tandhash=(); #? keep? this ids query regardless of target
  $tophsp='';
  @qparthsps= @sourcehsps= @allsaved= @mainexons= @secondexons= ();
  %moregenes=();
}


my $hascomm=0;
$gffout = *STDOUT;
if( $outfile && open(OUTH,">$outfile")) { $gffout= *OUTH; }
print $gffout "##gff-version 3\n"; ## if($dolocs);

while(<>) {
  chomp;

  if(/^\w/) {  # dont assume blast input has comments .. -m8
    my @v= split "\t";
    my ($qid, $sid, $pctident, $alignment_length, $mismatches, $gap_openings,
       $q_start, $q_end, $s_start, $s_end, $prob, $bit_score ) = @v;
    next unless($bit_score); # some error logs mixed in.. ## warn, die ??

    cleanid($qid);
    cleanid($sid);
    if($swap_querytarget) {
      ($qid,$sid,$q_start,$q_end,$s_start,$s_end)=
      ($sid,$qid,$s_start,$s_end,$q_start,$q_end);
      }

if(1) { # query/gene batching, not target/scaffold, with sort -k1,1 input
    reset_target_vars($lqid) if($qid ne $lqid);
} else {
    reset_target_vars($lsid) if($sid ne $lsid);
}

    my($s_strand,$q_strand)= ('+','+');
    if ($s_start > $s_end) { $s_strand='-'; ($s_start,$s_end)= ($s_end,$s_start);  }
    if ($q_start > $q_end) { $q_strand='-'; ($q_start,$q_end)= ($q_end,$q_start);  }

#     my($sh_id,$sh_start,$sh_end)= deshredloc($sid);
#     $s_start= $sh_start + $s_start;
#     $s_end  = $sh_start + $s_end;
#     $sid    = $sh_id;

#     my $skey="$sid.$q_start.$s_start.$s_end.$bit_score";
#     next if($shredhash{$skey});
#     $shredhash{$skey}++;

    # if ( $bitcutoff == 0 ) { $bitcutoff= $bit_score * 0.75; } #? 1st == best; also  $npart == 0
    # elsif ( $stringent2ndary && $bit_score < $bitcutoff && ($sid ne $lsid) ) { next; }

    ## binned top hits for filterBesthits ; can we do this inline ?
    # NOT USED NOW# storeBesthits($qid,$sid,$s_start,$s_end,$bit_score);

    # count hsps same query loc, diff db loc
    $tophsp= $sid if ($npart==0);

    my $tkey= "$qid-HSP:$q_start,$q_end";
    my $saved= 0;
    my $qoverlapped=0;
    my $soverlapped=0;  ## need only for genelocs.other ?

# if(0) { # need this anymore ?
#     foreach my $ex (@qparthsps) {
#       my($bs,$be)= ($$ex[3],$$ex[4]);
#       $qoverlapped= overlap1($q_start, $q_end, $bs, $be, $OVERLAP_SLOP_QUERY);
#       if ($qoverlapped) { $tkey="$qid-HSP:$bs,$be"; last; }
#       }
#
#     foreach my $ex (@sourcehsps) {
#       unless($$ex[1] ne $sid || $$ex[0] ne $qid) {
#         my($bs,$be)= ($$ex[5],$$ex[6]);
#         $soverlapped= overlap1($s_start, $s_end, $bs, $be, $OVERLAP_SLOP_GENOME);
#         last if $soverlapped;
#         }
#       }
# }

    # this is where we assume input is sorted by query-id and top bitscore :
    # we drop overlapped hsps silently here ... should reinstitute stringent2ndary test
    next if( $qoverlapped && $soverlapped ); # this happens only for same query, same scaffold
    ##next if($stringent2ndary && $bit_score < $bitcutoff && ($sid ne $lsid));

    my $hspval= [$qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
      $bit_score,  $prob, $tkey, $s_strand,
      $soverlapped, $qoverlapped];
    push(@qparthsps, $hspval) unless($qoverlapped);
    push(@sourcehsps, $hspval); # unless($qoverlapped && $soverlapped); # use this as recycled hits/scaffold

    $npart++; $lqid= $qid; $lsid= $sid;
  } # blast line
} # while(<>)


reset_target_vars($lqid); ## printGeneLocations();
close($gffout) if($gffout);

if(1) { # debug/verbose / stats of parts saved ...
  my($tpart,@tkeys,$v);
  my @sumkeys= sort keys %sumhash;
  warn "# Summary of HSPs saved\n";
  foreach $tpart (@sumkeys) {
    @tkeys= sort keys %{$sumhash{$tpart}};
    foreach (@tkeys) { $v= $sumhash{$tpart}{$_}; warn "# $tpart $_ = $v\n"; }
  }
}

# end ...................................................



=head2 yet another BLAST assignBestgene algorithm

  algo1 isnt good enough: sort by qgene, source-loc; join hsp by next-nooverlap
  new algo3:
      1. input sort by query-gene, top bitscore
      2. save list of top hsp (score), resort hsp by source-location
      3. for top-hsp down, join any next-door left & right same-strand hsp
          mark these as G1..n, save
      4.  continue w/ next-top hsp, eliminating source-overlapped w/ saved G hsp

=head2 original algorithm from blast9protstats (messy)

  # step1: -p loc
  gtar -Ozxf $soc/${dp}?prot9.blout*.tgz | \
   perl -n $bg/blast/blast9protstats.pl -p loc > ! ${dp}prot9.blexons &

  # step2: sort hsps with location bins ; should add this to perl
  # ** need this binning to separate, keep best hits/region
  # makeblexonsort.sh
  foreach ble ($dp*.blexons)
  set blo=`echo $ble | sed -e's/blexons/blexonsort/'`
  echo ====== $ble : $blo
  cat $ble | perl -ne\
  '($id)=m/tkey=(\S+)\-HSP:/; ($r)=m/^\w+:(\S+)/; ($tb,$te)=m/tloc=(\d+),(\d+)/;\
  @v=split; $db=$v[1]; $sbin=10000*int($v[3]/10000);  \
  print join("\t",$db,$r,$sbin,$v[5],$v[3],$v[4],$id,$tb,$te),"\n";'  \
  | sort -k1,2 -k3,3n -k4,4nr \
  > $blo
  echo
  end

  # step3: -p table (default)
  gtar -Ozxf $soc/${dp}?prot9.blout*.tgz | \
   perl -n $bg/blast/blast9protstats.pl -over 0.5 -besthits ${dp}prot9.blexonsort  -out ${dp}prot9.bltab7 &

=cut

sub _sortHsp_Score {
  my($ap)= $a->[7];
  my($bp)= $b->[7];
  return ($bp <=> $ap); # top score 1st
}

sub _sortHsp_SrcLocation  {
  my($ar,$ab,$ae,$ao)= @{$a}[1,5,6,10];
  my($br,$bb,$be,$bo)= @{$b}[1,5,6,10];
  my $ocmp= ($ao eq $bo)? 0 : ($ao eq "-") ? 1 : -1;
  return ($ar cmp $br || $ocmp || $ab <=> $bb || $be <=> $ae); # sort all -strand  last
}

sub bestlocHspset {
  my($hsploc, $tsid, $ts_start, $ts_end, $ts_strand)= @_;
  my @before=(); my @after=();
  my $skiphsp= 0;
  my($trange0, $trange1)= ($ts_start - MAX_EXON_SEPARATION, $ts_end + MAX_EXON_SEPARATION);
  foreach my $hspval (@$hsploc) { # instead of foreach can we hash-find nearby hsps?
    my($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
        $bit_score,  $prob, $tkey, $s_strand,
        $soverlapped, $qoverlapped)
        = @$hspval;
    next unless($sid eq $tsid && $s_strand eq $ts_strand && $s_start > $trange0 && $s_end < $trange1);
    if($s_end <= $ts_start + QBASEOVER) {
      unshift(@before, $hspval);
      }
    elsif($s_start >= $ts_end - QBASEOVER) {
      push(@after, $hspval);
      }
    }
  return (\@before, \@after); # sorted around hspbest
}


sub assignBestgene {  # version 3
  #? my($theqid) = @_; # not used

  my($topsid);
  $npart=0;
  my $saved=0;
  my $genenum = 0; my $lastgenenum= 0;
  my $lastexon= undef;
  @allsaved=(); # dont need global

  my @hspbest = sort _sortHsp_Score @sourcehsps;
  my @hsploc  = sort _sortHsp_SrcLocation @sourcehsps;

  # input sourcehsp should all be for same query-gene, sorted by location (NOT/was bitscore)
  # need location-sort to match up exon parts of genes
  my %donehsp=();
  my($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
        $bit_score,  $prob, $tkey, $s_strand,
        $soverlapped, $qoverlapped);

foreach my $hspbest (@hspbest) {
  ($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
        $bit_score,  $prob, $tkey, $s_strand,
        $soverlapped, $qoverlapped)
        = @$hspbest;
  my $topkey="$qid.$sid.$q_start.$s_start.$s_end.$bit_score";
# warn "#top $topkey\n"; #DEBUG
  next if $donehsp{$topkey}++;
  next if overlapsome($s_start, $s_end, \@allsaved, $OVERLAP_SLOP_GENOME);

  $genenum++;
  $topsid= $sid; ## if ($npart==0);

  my $keynum= $genenum;
  unless(ref $moregenes{$keynum}) { $moregenes{$keynum}= []; }
  push( @{$moregenes{$keynum}}, $hspbest); $saved=1;
  $sumhash{'other'}{($saved?'saved':'notsaved')}++;
  if ($saved) {
    push(@allsaved, $hspbest);
    $sumhash{'ALL'}{($saved?'saved':'notsaved')}++;
    }

  my($tsid, $tq_start, $tq_end, $ts_start, $ts_end, $ts_strand)=
    ($sid, $q_start, $q_end, $s_start, $s_end, $s_strand);
  my($trange0, $trange1)= ($ts_start - MAX_EXON_SEPARATION, $ts_end + MAX_EXON_SEPARATION);
  my($srange0, $srange1)= ($ts_start, $ts_end);
  my($qrange0, $qrange1)= ($tq_start, $tq_end);

    ## FIXME.3 this really needs to step thru @hsploc starting at $hspbest loc and go down,up from there
    ## otherwise qrange, srange are bad.  are getting two genes made from very good pieces of 1 gene match
    ## due to interior hsp's skipped in first pass nearest to hspbest.

  my(@before,@after);
  my($before, $after)= bestlocHspset(\@hsploc, $tsid, $ts_start, $ts_end, $ts_strand);

  foreach my $hspval (@$after, @$before) { # instead of foreach can we hash-find nearby hsps?
    # next unless (ref $hspval); #?

    ($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
        $bit_score,  $prob, $tkey, $s_strand,
        $soverlapped, $qoverlapped)
        = @$hspval;

    ## FIXME here; should not skip, but keep some of these to check;
    ## should not look far away if nearby exon fits; it it is done already or overlaps, count in qrange and skip on
    ## FIXME.2 new problem with this; far-hsp already done can eat away query-range; need to skip those

    ## already done# next unless($sid eq $tsid && $s_strand eq $ts_strand && $s_start > $trange0 && $s_end < $trange1);

    my $skiphsp= 0;
    my $atlockey="$qid.$sid.$q_start.$s_start.$s_end.$bit_score";
## warn "#at $atlockey\n"; #DEBUG
    next if($atlockey eq $topkey);
    $skiphsp=1 if $donehsp{$atlockey};
    $skiphsp=1 if $skiphsp || overlapsome($s_start, $s_end, \@allsaved, $OVERLAP_SLOP_GENOME);

    my $qover= overlap1($q_start, $q_end, $qrange0, $qrange1, $OVERLAP_SLOP_QUERY);
    next if($qover); # last?
    # $skiphsp=1 if ($qover);
    ## FIXME: need to look at qloc vs top-qloc; if -strand, @before must be higher qloc, @after lower
    ## and v.v.

    # if($skiphsp) now check that s_start,s_end is *near* top hsp; skip if not
    my $nearover= nearover1($s_start, $s_end, $srange0, $srange1, $OVERLAP_SLOP_GENOME, 5000); # GAPSIZE
    if($nearover >= 0) { next if $skiphsp; }

    if($s_end <= $ts_start + QBASEOVER) { # before
#       if($s_strand eq '-') { $qrange1 = $q_end if($q_end> $qrange1);  }
#       else {  $qrange0 = $q_start if($q_start< $qrange0); } # not before
      if($s_strand eq '-') { next if($q_start + QBASEOVER < $qrange1); $qrange1 = $q_end if($q_end> $qrange1);  }
      else { next if($q_end - QBASEOVER > $qrange0);  $qrange0 = $q_start if($q_start< $qrange0); } # not before
      unshift(@before, $hspval) unless $skiphsp;
      $srange0 = $s_start unless $skiphsp;
      }

    elsif($s_start >= $ts_end - QBASEOVER) { # after; bug in next here
#       if($s_strand eq '-') {  $qrange0 = $q_start if($q_start < $qrange0); }
#       else {  $qrange1 = $q_end if($q_end > $qrange1);  } # not after
      if($s_strand eq '-') { next if($q_end - QBASEOVER > $qrange0); $qrange0 = $q_start if($q_start< $qrange0); }
      else { next if($q_start  + QBASEOVER < $qrange1);  $qrange1 = $q_end if($q_end> $qrange1);  } # not after
      push(@after, $hspval) unless $skiphsp;
      $srange1 = $s_end unless $skiphsp;
      }
##warn "#skip=$skiphsp $atlockey\n"; #DEBUG
  }

  #? limit before, after size? dont try to find what isn't there by too far a match
  foreach my $hspval (@before, @after) { # .. and after

    ($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
        $bit_score,  $prob, $tkey, $s_strand,
        $soverlapped, $qoverlapped)
        = @$hspval;
    my $atlockey="$qid.$sid.$q_start.$s_start.$s_end.$bit_score";
    $saved= 0; ##no## $genenum= 0;

    if(1) {
      #my $keynum= "G" . $genenum;
      my $keynum=  $genenum;
      unless(ref $moregenes{$keynum}) { $moregenes{$keynum}= []; }
      push( @{$moregenes{$keynum}}, $hspval); $saved=1;
      $sumhash{'other'}{($saved?'saved':'notsaved')}++;
    }

    if ($saved) {
      $lastexon= $hspval;
      $lastgenenum= $genenum;
      $donehsp{$atlockey}++;
      push(@allsaved, $hspval);
      $sumhash{'ALL'}{($saved?'saved':'notsaved')}++;
      }
   $npart++;
  }
}

}

sub printGeneLocations {
  my($qid) = @_; # not used

  if(@sourcehsps && $qid && 0 < $didid{$qid}++) { die "ERROR: $qid already seen\n$USAGE"; }

  assignBestgene(); # NEW

  my $ng= printOneLocation(\@mainexons, 'G1'); # not used now
  $ng  += printOneLocation(\@secondexons, 'G2'); # not used now
  foreach my $keynum (sort{$a<=>$b} keys %moregenes) { ## NOW NUMERIC KEY
    (my $gnum= $keynum) =~ s/^(.)[a-z]+/$1/;  ## S2..S9; o2..o9
    $gnum = 'G'.$gnum;
    $ng += printOneLocation($moregenes{$keynum}, $gnum);
    }
  return $ng;
}


sub printOneLocation {
  my($exons, $igene) = @_;
  return 0 unless(ref $exons && @$exons > 0);

  my $qdb=".";
  my @locs= ();
  my($sum_bit_score,$sum_align,$iex)=(0)x10;
  my($qid1,$qid,$sid,$alignment_length,$q_start,$q_end,$s_start,$s_end,
      $bit_score, $prob, $tkey,$s_strand, $soverlapped, $qoverlapped);

  my @gffout= ();
  my($g_start,$g_end, $g_strand)=(undef,0);
  foreach my $ex ((sort{ $$a[5] <=> $$b[5] } @$exons)) {
    ($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
      $bit_score,  $prob, $tkey, $s_strand,
      $soverlapped, $qoverlapped)= @$ex;

    $qdb= $querySource; # default
    ($qdb,$qid1)= ($qid =~m/:/) ? split(/[:]/, $qid,2) : ($qdb,$qid);

    $g_start= $s_start unless(defined $g_start);
    $g_end= $s_end;
    $g_strand= $s_strand;
    $iex++;
    $sum_bit_score += $bit_score;
    $sum_align += $alignment_length;
    if(1) { # == GFF_LOC
      $tkey =~ s/^\w+://;  $tkey =~ s/,/-/g; # extra dbid and no commas in values
      #was# $attr= "Parent=${qid1}_${igene};tkey=$tkey;tloc=$q_start-$q_end;align=$alignment_length";
      my $attr= "Parent=${qid1}_${igene}";
      $attr.= ";Target=$qid $q_start $q_end;align=$alignment_length" if($dotarget);
      # print $gffout
      push @gffout, join("\t",
      ($sid, $qdb, $exonType, $s_start, $s_end, $bit_score, $s_strand,".",$attr)). "\n";
      }
    # else { push(@locs,"$s_start..$s_end"); }
    }

  $max_bit_score = $sum_bit_score if($sum_bit_score > $max_bit_score ); # assumes output by best match 1st
  return if ($sum_bit_score / $max_bit_score < $LOWSCORE_SKIP);

  if($addgene) {
    my $attr= "ID=${qid1}_${igene}";
    $attr.= ";Target=$qid;align=$sum_align" if($dotarget);
    unshift @gffout, join("\t",
      ($sid, $qdb, $geneType, $g_start, $g_end, $sum_bit_score, $g_strand,".",$attr)). "\n";
  }

  print $gffout @gffout;
  return 1;
}

sub overlapsome { # need scaffold?
  my($s_start, $s_end, $exons, $OVERLAP_SLOP)= @_;
  foreach my $ex (@$exons) {
    return 1 if overlap1($s_start, $s_end, $$ex[5], $$ex[6], $OVERLAP_SLOP);
  }
  return 0;
}

sub overlap1 {
  my($s_start, $s_end, $b_start, $b_end, $OVERLAP_SLOP)= @_;
  return 0 unless ( $s_start < $b_end && $s_end > $b_start  ); ## no overlap
  return 1 if ( $s_start >= $b_start && $s_end <= $b_end ); # contained-in
  if ( $s_end > $b_start && $s_start < $b_end ) {
    ## e.g.  s= 20,50  ; b = 10,30 : o= 10* or 40
    my $b_len= 1 + $b_end - $b_start;
    my $s_len= 1 + $s_end - $s_start; # choose ? biggest
    my $olp1 =  $b_end - $s_start; # 30-20 = 10
    my $olp2 =  $s_end - $b_start; # 50-10 = 40
    $olp1= $olp2 if ($olp2<$olp1);
    $s_len= $b_len if ($s_len<$b_len);
    return 1 if (($olp1 / $s_len) > $OVERLAP_SLOP);
    }
  return 0;
}

sub nearover1 {  # near == -1; over == +1; other == 0
  my($s_start, $s_end, $b_start, $b_end, $OVERLAP_SLOP, $gapsize)= @_;

  # need some overlap slop here; note locs here are QUERY == protein; should not be large
  unless ( $s_start < ($b_end - QBASEOVER) && $s_end > ($b_start + QBASEOVER)  ) {
    if($s_start >= $b_end) { return ($s_start - $b_end < $gapsize) ? -1 : 0; }
    elsif($b_start >= $s_end) { return ($b_start - $s_end < $gapsize) ? -1 : 0; }
    return 0;
  }

  return overlap1($s_start, $s_end, $b_start, $b_end, $OVERLAP_SLOP);
}


sub minabs {
  my($a,$b)= @_;
  $a= abs($a); $b= abs($b);
  return ($a>$b)? $b : $a;
}

sub toofar1 {
  my($q_start, $q_end, $b_start, $b_end)= @_;
  return 1 if minabs($b_end - $q_start,$q_end - $b_start) > MAX_EXON_SEPARATION;
  return 0;
}

sub cleanid {
  unless( $_[0] =~ s/$faprefix//) { $_[0] =~ s/^gi\|\d+\|(\S+)/$1/; }  # drop gi nums for ids
  $_[0] =~ s/\|$//; ## some ids have '|' at end of id; chomp
  $_[0] =~ s/\|/:/; #?? change pipe to db:id format
}

__END__

my $bestsort=0; # fixme

# NOT USED NOW
sub storeBesthits {
  my($qid,$sid,$s_start,$s_end,$bit_score)= @_;

  ## binned top hits for filterBesthits ; can we do this inline ?
  my($qdb,$qid1)= ($qid =~m/:/) ? split(/[:]/, $qid,2) : ($querySource,$qid);
  my ($sbin)= ($BIN10000 * int($s_start/$BIN10000)) ; # must match besthash bin size
  my $bestkey="$qdb.$sid.$sbin";
  my @bv= ($bit_score,$s_start,$s_end,$qid);
  push( @{$besthash{$bestkey}}, \@bv); # arrayref of arrayref

  $bestkey="$qdb"; # best over all genome == _G1
  if( ! $besthash{$bestkey} ) {
    $besthash{$bestkey}= [\@bv];
  } elsif($bit_score > $besthash{$bestkey}->[0]->[0]) {
    $besthash{$bestkey}->[0]= \@bv;
  }
}

# NOT USED NOW
sub filterBesthits {
  my($qid,$sid,$s_start,$s_end,$s_strand,$bit_score)= @_;

  unless($bestsort>0) {
    $bestsort++;
    foreach my $akey (sort keys %besthash) {
      my @blocs= sort{$b->[0] <=> $a->[0]} @ { $besthash{$akey} };
      $besthash{$akey}= \@blocs;
      }
    }

  my $snotbest= 0;
  return 0 unless(scalar(%besthash));

  my ($qdb,$qid1)= ($qid =~m/:/) ? split(/[:]/, $qid,2) : ($querySource,$qid);
  my ($sbin)= ($BIN10000 * int($s_start/$BIN10000)) ; # must match besthash bin size
  my($sdb,$sid1)= ($sid =~/:/) ? split(/[:]/, $sid,2) : ('',$sid);
  my $bestkey="$qdb.$sid1.$sbin";
  my $isbest = $besthash{$bestkey};

  if($isbest) {
    my @blocs= @ { $besthash{$bestkey} };
    foreach my $bl (@blocs) {
      my($bbits,$bb,$be,$bqid)= @$bl;
      $snotbest= overlap1($s_start, $s_end, $bb, $be, $OVERLAP_SLOP_GENOME);
      if($snotbest) {
        $snotbest= 0 if($bit_score >= $bbits || $qid eq $bqid); # keep self match
        last;
        }
      }
    $sumhash{'besthits'}{($snotbest?'no':'yes')}++;
    }
  else {
    $sumhash{'besthits'}{'missed'}++;
    }
  return $snotbest;
}


# NOT USED NOW
sub pushGeneExons {
  my($exons, $hspval, $q_start,$q_end,$s_start,$s_end) = @_;

  my $qoverlapped=0;
  my $toofar= -1;
  foreach my $ex (@$exons) {
    my ($bs,$be)= ($$ex[3],$$ex[4]);
    $qoverlapped= overlap1($q_start,$q_end, $bs, $be, $OVERLAP_SLOP_QUERY);
    last if ($qoverlapped);
    if($toofar != 0) { $toofar= toofar1($s_start,$s_end, $$ex[5],$$ex[6]); }
    }
  return 0 if($qoverlapped || $toofar>0);
  push(@$exons, $hspval); # [$qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end, $bit_score,  $prob]
  return 1;
}




## this one is out; not good enough
#
# sub assignBestgene1 {
#   my($tophsp);
#   $npart=0;
#   my $saved=0;
#   my $genenum = 0; my $lastgenenum= 0;
#   my $lastexon= undef;
#
#   # input sourcehsp should all be for same query-gene, sorted by location (NOT/was bitscore)
#   # need location-sort to match up exon parts of genes
#
#   foreach my $hspval (@sourcehsps) {
#     my($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
#         $bit_score,  $prob, $tkey, $s_strand,
#         $soverlapped, $qoverlapped)
#         = @$hspval;
#     $saved= 0; $genenum= 0;
#
# ## test this; not working right
# ## really need to test, for each hsp?, nearby hsp both directions and need to keep strand same
#     my $overlast=0;
#     foreach my $ex ($lastexon) {
#       my ($bsid,$bs,$be)= ($$ex[1],$$ex[3],$$ex[4]);
#       next unless($bsid eq $sid);
#       # $overlast= overlap1($q_start, $q_end, $bs, $be, $OVERLAP_SLOP_QUERY);
#       # last if ($overlast);
#       my $nearlast= nearover1($q_start, $q_end, $bs, $be, $OVERLAP_SLOP_QUERY);
#       if ($nearlast<0) { $genenum= $lastgenenum; last; }
#       elsif($nearlast>0) { $overlast=1; last; }
#       }
#     # next if $overlast; #??
#
#     $tophsp= $sid if ($npart==0);
#     my $issame= ($sid eq $tophsp);
#     my $scafpart;
#     if($npart==0 || $issame) { $tandhash{'Samescaf'}{$tkey}++; $scafpart='Samescaf'; }
#     if($npart==0 || !$issame) { $tandhash{'otherscaf'}{$tkey}++; $scafpart= 'otherscaf' if($npart>0); }
#
#     $genenum = $tandhash{$scafpart}{$tkey} unless ($genenum);
#         # ^^ this is easily bad : tkey and messes up joining exon-parts of gene : splits too much
#
#     if( $soverlapped  ) {
#       $sumhash{'other_soverlap'}{($saved?'saved':'notsaved')}++;
#
#     #} elsif( $overlast  ) { # this is problem
#     #  $sumhash{'other_qoverlap'}{($saved?'saved':'notsaved')}++;
#
#     } elsif($sid eq $tophsp && $genenum == 1) {  #  primary-match exons
#       my $snotbest= 0; #?? filterBesthits($qid,$sid,$s_start,$s_end,$s_strand,$bit_score);
#       if ($snotbest) { $saved=0; }
#       else { $saved= pushGeneExons(\@mainexons, $hspval, $q_start,$q_end, $s_start,$s_end); }
#       $sumhash{'G1'}{($saved?'saved':'notsaved')}++;
#       }
#
# ## test off; need this?
# #     elsif($sid eq $tophsp && $genenum == 2) { #  2nd-match exons
# #       my $snotbest= filterBesthits($qid,$sid,$s_start,$s_end,$s_strand,$bit_score);
# #       if ($snotbest) { $saved=0; }
# #       else { $saved= pushGeneExons(\@secondexons, $hspval, $q_start,$q_end, $s_start,$s_end); }
# #       $sumhash{'G2'}{($saved?'saved':'notsaved')}++;
# #       }
#
#     elsif( !$soverlapped ) { # other-match exons
#       my $snotbest= filterBesthits($qid,$sid,$s_start,$s_end,$s_strand,$bit_score);
#       $saved= 0;
#       unless($snotbest) {
#         my $keynum= $scafpart . $genenum;
#         unless(ref $moregenes{$keynum}) { $moregenes{$keynum}= []; }
#         $saved= pushGeneExons( $moregenes{$keynum}, $hspval, $q_start,$q_end, $s_start,$s_end);
#       }
#       $sumhash{'other'}{($saved?'saved':'notsaved')}++;
#     }
#
#     if ($saved) {
#       $lastexon= $hspval;
#       $lastgenenum= $genenum;
#       push(@allsaved, $hspval);
#       $sumhash{'ALL'}{($saved?'saved':'notsaved')}++;
#       }
#    $npart++;
#   }
# }


## this one is out; not good enough
# sub assignBestgene_OLD {
#   my($tophsp);
#   $npart=0;
#   my $saved=0;
#   foreach my $hspval (@sourcehsps) { # these should all be for same query-gene, sorted by bitscore
#      my($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
#         $bit_score,  $prob, $tkey, $s_strand,
#         $soverlapped, $qoverlapped)
#         = @$hspval;
#
#     $tophsp= $sid if ($npart==0);
#     my $issame= ($sid eq $tophsp);
#     my $scafpart;
#     if($npart==0 || $issame) { $tandhash{'Samescaf'}{$tkey}++; $scafpart='Samescaf'; }
#     if($npart==0 || !$issame) { $tandhash{'otherscaf'}{$tkey}++; $scafpart= 'otherscaf' if($npart>0); }
#
#     my $genenum = $tandhash{$scafpart}{$tkey}; ## need same/diff num
#         # ^^ this is easily bad : tkey and messes up joining exon-parts of gene : splits too much
#
#     if($sid eq $tophsp && $genenum == 1) {  #  primary-match exons
#       my $snotbest= 0; #?? filterBesthits($qid,$sid,$s_start,$s_end,$s_strand,$bit_score);
#       if ($snotbest) { $saved=0; }
#       else { $saved= pushGeneExons(\@mainexons, $hspval, $q_start,$q_end, $s_start,$s_end); }
#       $sumhash{'G1'}{($saved?'saved':'notsaved')}++;
#       }
#
#     elsif($sid eq $tophsp && $genenum == 2) { #  2nd-match exons
#       my $snotbest= filterBesthits($qid,$sid,$s_start,$s_end,$s_strand,$bit_score);
#       if ($snotbest) { $saved=0; }
#       else { $saved= pushGeneExons(\@secondexons, $hspval, $q_start,$q_end, $s_start,$s_end); }
#       $sumhash{'G2'}{($saved?'saved':'notsaved')}++;
#       }
#
#    elsif( !$soverlapped) { # other-match exons
#       my $snotbest= filterBesthits($qid,$sid,$s_start,$s_end,$s_strand,$bit_score);
#       $saved= 0;
#       unless($snotbest) {
#       my $keynum= $scafpart . $genenum;
#       unless(ref $moregenes{$keynum}) { $moregenes{$keynum}= []; }
#       $saved= pushGeneExons( $moregenes{$keynum}, $hspval, $q_start,$q_end, $s_start,$s_end);
#       }
#       $sumhash{'other'}{($saved?'saved':'notsaved')}++;
#     }
#
#     if ($saved) {
#       push(@allsaved, $hspval);
#       $sumhash{'ALL'}{($saved?'saved':'notsaved')}++;
#       }
#    $npart++;
#   }
# }
#


# sub assignBestgene2 {  # version 2
#
#   my($topsid);
#   $npart=0;
#   my $saved=0;
#   my $genenum = 0; my $lastgenenum= 0;
#   my $lastexon= undef;
#   @allsaved=(); # dont need global
#
#   my @hspbest = sort _sortHsp_Score @sourcehsps;
#   my @hsploc  = sort _sortHsp_SrcLocation @sourcehsps;
#
#   # input sourcehsp should all be for same query-gene, sorted by location (NOT/was bitscore)
#   # need location-sort to match up exon parts of genes
#   my %donehsp=();
#   my($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
#         $bit_score,  $prob, $tkey, $s_strand,
#         $soverlapped, $qoverlapped);
#
# foreach my $hspbest (@hspbest) {
#   ($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
#         $bit_score,  $prob, $tkey, $s_strand,
#         $soverlapped, $qoverlapped)
#         = @$hspbest;
#   my $topkey="$qid.$sid.$q_start.$s_start.$s_end.$bit_score";
#   next if $donehsp{$topkey}++;
#   next if overlapsome($s_start, $s_end, \@allsaved, $OVERLAP_SLOP_GENOME);
#
#   $genenum++;
#   $topsid= $sid; ## if ($npart==0);
#
#   my $keynum= $genenum;
#   unless(ref $moregenes{$keynum}) { $moregenes{$keynum}= []; }
#   push( @{$moregenes{$keynum}}, $hspbest); $saved=1;
#   $sumhash{'other'}{($saved?'saved':'notsaved')}++;
#   if ($saved) {
#     push(@allsaved, $hspbest);
#     $sumhash{'ALL'}{($saved?'saved':'notsaved')}++;
#     }
#
#   my($tsid, $tq_start, $tq_end, $ts_start, $ts_end, $ts_strand)=
#     ($sid, $q_start, $q_end, $s_start, $s_end, $s_strand);
#   my($trange0, $trange1)= ($ts_start - MAX_EXON_SEPARATION, $ts_end + MAX_EXON_SEPARATION);
#   my($srange0, $srange1)= ($ts_start, $ts_end);
#   my($qrange0, $qrange1)= ($tq_start, $tq_end);
#   my(@before,@after);
#
#   foreach my $hspval (@hsploc) { # instead of foreach can we hash-find nearby hsps?
#
#     ($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
#         $bit_score,  $prob, $tkey, $s_strand,
#         $soverlapped, $qoverlapped)
#         = @$hspval;
#
#     ## FIXME here; should not skip, but keep some of these to check;
#     ## should not look far away if nearby exon fits; it it is done already or overlaps, count in qrange and skip on
#     ## FIXME.2 new problem with this; far-hsp already done can eat away query-range; need to skip those
#
#     ## FIXME.3 this really needs to step thru @hsploc starting at $hspbest loc and go down,up from there
#     ## otherwise qrange, srange are bad.  are getting two genes made from very good pieces of 1 gene match
#     ## due to interior hsp's skipped in first pass nearest to hspbest.
#
#
#     my $skiphsp= 0;
#     next unless($sid eq $tsid && $s_strand eq $ts_strand && $s_start > $trange0 && $s_end < $trange1);
#
#     my $atlockey="$qid.$sid.$q_start.$s_start.$s_end.$bit_score";
#     $skiphsp=1 if $donehsp{$atlockey};
#     $skiphsp=1 if $skiphsp || overlapsome($s_start, $s_end, \@allsaved, $OVERLAP_SLOP_GENOME);
#
#     my $qover= overlap1($q_start, $q_end, $qrange0, $qrange1, $OVERLAP_SLOP_QUERY);
#     next if($qover); # last?
#     # $skiphsp=1 if ($qover);
#     ## FIXME: need to look at qloc vs top-qloc; if -strand, @before must be higher qloc, @after lower
#     ## and v.v.
#
#     # if($skiphsp) now check that s_start,s_end is *near* top hsp; skip if not
#     my $nearover= nearover1($s_start, $s_end, $srange0, $srange1, $OVERLAP_SLOP_GENOME, 1000); # GAPSIZE
#     if($nearover >= 0) { next if $skiphsp; }
#
#     if($s_end <= $ts_start + QBASEOVER) {
#       if($s_strand eq '-') { next if($q_end < $qrange1); $qrange1 = $q_end if($q_end> $qrange1);  }
#       else { next if($q_start > $qrange0);  $qrange0 = $q_start if($q_start< $qrange0); } # not before
#       unshift(@before, $hspval) unless $skiphsp;
#       $srange0 = $s_start unless $skiphsp;
#       }
#     elsif($s_start >= $ts_end - QBASEOVER) {
#       if($s_strand eq '-') { next if($q_start > $qrange0); $qrange0 = $q_start if($q_start< $qrange0); }
#       else { next if($q_end < $qrange1);  $qrange1 = $q_end if($q_end> $qrange1);  } # not after
#       push(@after, $hspval) unless $skiphsp;
#       $srange1 = $s_end unless $skiphsp;
#       }
#   }
#
#   #? limit before, after size? dont try to find what isn't there by too far a match
#   foreach my $hspval (@before, @after) { # .. and after
#
#     ($qid,$sid,$alignment_length, $q_start,$q_end,$s_start,$s_end,
#         $bit_score,  $prob, $tkey, $s_strand,
#         $soverlapped, $qoverlapped)
#         = @$hspval;
#     my $atlockey="$qid.$sid.$q_start.$s_start.$s_end.$bit_score";
#     $saved= 0; ##no## $genenum= 0;
#
#     if(1) {
#       #my $keynum= "G" . $genenum;
#       my $keynum=  $genenum;
#       unless(ref $moregenes{$keynum}) { $moregenes{$keynum}= []; }
#       push( @{$moregenes{$keynum}}, $hspval); $saved=1;
#       $sumhash{'other'}{($saved?'saved':'notsaved')}++;
#     }
#
#     if ($saved) {
#       $lastexon= $hspval;
#       $lastgenenum= $genenum;
#       $donehsp{$atlockey}++;
#       push(@allsaved, $hspval);
#       $sumhash{'ALL'}{($saved?'saved':'notsaved')}++;
#       }
#    $npart++;
#   }
# }
#
# }