Skip to content

Commit

Permalink
Merge pull request #85 from dfornika/easy-un-pair
Browse files Browse the repository at this point in the history
Addition of multiple output tags to allow un-pairing of paired reads for classified/unclassified read output (written by @dfornika)
  • Loading branch information
jenniferlu717 authored Oct 30, 2017
2 parents f2910a8 + 999635a commit 5bb5be0
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 36 deletions.
17 changes: 14 additions & 3 deletions scripts/kraken
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ my $quick = 0;
my $min_hits = 1;
my $fasta_input = 0;
my $fastq_input = 0;
my $fastq_output = 0;
my $db_prefix;
my $threads;
my $preload = 0;
Expand All @@ -55,6 +56,7 @@ my $check_names = 0;
my $only_classified_output = 0;
my $unclassified_out;
my $classified_out;
my $output_format = "legacy";
my $outfile;

GetOptions(
Expand All @@ -64,10 +66,12 @@ GetOptions(
"threads=i" => \$threads,
"fasta-input" => \$fasta_input,
"fastq-input" => \$fastq_input,
"fastq-output" => \$fastq_output,
"quick" => \$quick,
"min-hits=i" => \$min_hits,
"unclassified-out=s" => \$unclassified_out,
"classified-out=s" => \$classified_out,
"out-fmt=s" => \$output_format,
"output=s" => \$outfile,
"preload" => \$preload,
"paired" => \$paired,
Expand Down Expand Up @@ -137,14 +141,17 @@ push @flags, "-d", $kdb_file;
push @flags, "-i", $idx_file;
push @flags, "-t", $threads if $threads > 1;
push @flags, "-n", $taxonomy if defined $taxonomy;
push @flags, "-q" if $quick;
push @flags, "-q", if $quick;
push @flags, "-m", $min_hits if $min_hits > 1;
push @flags, "-f" if $fastq_input && ! $paired; # merger always outputs FASTA
push @flags, "-f", if $fastq_input;
push @flags, "-F", if $fastq_output;
push @flags, "-U", $unclassified_out if defined $unclassified_out;
push @flags, "-C", $classified_out if defined $classified_out;
push @flags, "-O", $output_format if defined $output_format;
push @flags, "-o", $outfile if defined $outfile;
push @flags, "-c", if $only_classified_output;
push @flags, "-M" if $preload;
push @flags, "-M", if $preload;
push @flags, "-P", if $paired;

# handle piping for decompression/merging
my @pipe_argv;
Expand All @@ -155,6 +162,7 @@ if ($paired) {
push @merge_flags, "--gz" if $gunzip;
push @merge_flags, "--bz2" if $bunzip2;
push @merge_flags, "--check-names" if $check_names;
push @merge_flags, "--output-format", $output_format if defined $output_format;
@pipe_argv = ("read_merger.pl", @merge_flags, @ARGV);
}
elsif ($compressed) {
Expand Down Expand Up @@ -210,6 +218,7 @@ Options:
--threads NUM Number of threads (default: $def_thread_ct)
--fasta-input Input is FASTA format
--fastq-input Input is FASTQ format
--fastq-output Output in FASTQ format
--gzip-compressed Input is gzip compressed
--bzip2-compressed Input is bzip2 compressed
--quick Quick operation (use first hit or hits)
Expand All @@ -219,6 +228,8 @@ Options:
Print unclassified sequences to filename
--classified-out FILENAME
Print classified sequences to filename
--out-fmt FORMAT Format for [un]classified sequence output. supported
options are: {legacy, paired, interleaved}
--output FILENAME Print output to filename (default: stdout); "-" will
suppress normal output
--only-classified-output
Expand Down
59 changes: 48 additions & 11 deletions scripts/read_merger.pl
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@
my $gunzip = 0;
my $bunzip2 = 0;
my $check_names = 0;
my $output_format;

GetOptions(
"fa" => \$fasta_input,
"fq" => \$fastq_input,
"gz" => \$gunzip,
"bz2" => \$bunzip2,
"check-names" => \$check_names
"check-names" => \$check_names,
"output-format=s" => \$output_format
);

if (@ARGV != 2) {
Expand Down Expand Up @@ -84,16 +86,31 @@

# read/merge/print loop
# make sure names match before merging
my ($seq1, $seq2);
my ($seq1, $seq2, $delimiter);
if ($output_format eq "legacy") {
$delimiter = 'N';
}
else {
$delimiter = '|';
}
while (defined($seq1 = read_sequence($fh1))) {
$seq2 = read_sequence($fh2);
if (! defined $seq2) {
die "$PROG: mismatched sequence counts\n";
}
if ($check_names && $seq1->{id} ne $seq2->{id}) {
die "$PROG: mismatched mate pair names ('$seq1->{id}' & '$seq2->{id}')\n";
if ($check_names) {
my $comparison_id1 = $seq1->{id} =~ /(\S+)/; # Only check up until first whitespace character
my $comparison_id2 = $seq2->{id} =~ /(\S+)/;
if ($comparison_id1 ne $comparison_id2) {
die "$PROG: mismatched mate pair names ('$seq1->{id}' & '$seq2->{id}')\n";
}
}
if ($fastq_input) {
print_merged_sequence_fastq($seq1, $seq2, $delimiter);
}
else {
print_merged_sequence($seq1, $seq2, $delimiter);
}
print_merged_sequence($seq1, $seq2);
}
if (defined($seq2 = read_sequence($fh2))) {
die "$PROG: mismatched sequence counts\n";
Expand All @@ -107,6 +124,7 @@
my $fh = shift;
my $id;
my $seq = "";
my $qual = "";
if (! exists $buffers{$fh}) {
$buffers{$fh} = <$fh>;
}
Expand All @@ -133,7 +151,14 @@
}
}
elsif ($fastq_input) {
if ($buffers{$fh} =~ /^@(\S+)/) {
my $fastq_header_regex;
if ($output_format eq "legacy") {
$fastq_header_regex = qr/^@(\S+)/;
}
else {
$fastq_header_regex = qr/^@(\S+\s\S+)/; # Allow one whitespace character in fastq header
}
if ($buffers{$fh} =~ $fastq_header_regex) {
$id = $1;
}
else {
Expand All @@ -145,20 +170,32 @@
delete $buffers{$fh};
chomp($seq = <$fh>);
scalar <$fh>; # quality header
scalar <$fh>; # quality values
chomp($qual = <$fh>); # quality values
}
else {
# should never get here
die "$PROG: I have no idea what kind of input I'm reading!!!\n";
}

$id =~ s/[\/_.][12]$//; # strip /1 (or .1, _1) or /2 to help comparison
return { id => $id, seq => $seq };
return { id => $id, seq => $seq, qual => $qual };
}
}

sub print_merged_sequence {
my ($seq1, $seq2) = @_;
my ($seq1, $seq2, $delimiter) = @_;
print ">" . $seq1->{id} . "\n";
print $seq1->{seq} . "N" . $seq2->{seq} . "\n";
print $seq1->{seq} . $delimiter . $seq2->{seq} . "\n";
}

sub print_merged_sequence_fastq {
my ($seq1, $seq2, $delimiter) = @_;
if ($output_format eq "legacy") {
print "@" . $seq1->{id} . "\n";
}
else {
print "@" . $seq1->{id} . $delimiter . $seq2->{id} . "\n";
}
print $seq1->{seq} . $delimiter . $seq2->{seq} . "\n";
print "+\n";
print $seq1->{qual} . $delimiter . $seq2->{qual} . "\n";
}
Loading

0 comments on commit 5bb5be0

Please sign in to comment.