-
Notifications
You must be signed in to change notification settings - Fork 4
/
Filter_masked_Seqs.pl
144 lines (99 loc) · 4.51 KB
/
Filter_masked_Seqs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
# this script can remove sequences from multi fasta files based on % of masked region and length.
our($inputfile,$outputfile,$count,%seq_hash);
our($opt_s,$opt_o,$opt_l,$opt_p,$opt_n);
getopt('solpn');
my$usage= "
This script will filter sequences masked by the any masker programs from non masked ones. User can provide filtering criteria.
\n\nusage : perl script_name [options]
options:
-s Sequence file containing sequences in fasta format
-l Min Length of the sequence to keep [1].
-n Min Number of non-masked nucleotide in sequence [0].
-p Min Percent of sequence mask allowed [100].
-o Output file to store resulting sequences.
";
if(!$opt_s){print $usage;}
#opening inputfile containing sequences in fasta format.
if ($opt_s){$inputfile=$opt_s}elsif($ARGV[0]){$inputfile=$ARGV[0]}else{die "\nPlease provide sequence file to filter\n"}
chomp ($inputfile);
open INPUT,"<$inputfile" or die "Cannot open $inputfile.....\n\n";
$opt_l = 1 if !defined $opt_l;
$opt_p = 100 if !defined $opt_p;
$opt_n = 0 if !defined $opt_n;
# Read fasta file in hash. Change the names if name s not uniq
%seq_hash=ReadFasta($inputfile);
#opening outputfile
if ($opt_o){$outputfile=$opt_o;} else{$outputfile='maskfiltered_'.'l-'.$opt_l.'-P_'.$opt_p.'_'.'n-'.$opt_n.'_'.$opt_s;}
if($outputfile){
chomp($outputfile) ;
open OUT,">$outputfile" or die "cannot create $outputfile.....\n\n" ;
}
#*************************************************************************************************************************
#read all the pattern or key words into array @pattern. and search for the matching one in abovemade hash.
my $count_found=my $count_notfound=0;
print"Filtering Masked sequences...Plz wait....\n";
foreach my$Seq_header(keys %seq_hash){
# Count Ns in sequence and calculate % of sequence masked
my$mask=($seq_hash{$Seq_header}=~tr/Nn/Nn/);
my$non_mask = ($seq_hash{$Seq_header}=~tr/ATGCatgc/ATGCatgc/);
my$seq_length = length($seq_hash{$Seq_header});
my$mask_percent=(100*$mask)/$seq_length;
#filter for % mask and length
if($seq_length >= $opt_l && $mask_percent <= $opt_p && $non_mask >= $opt_n){
print OUT">$Seq_header\n$seq_hash{$Seq_header}\n";
$count_found++;
}
else{$count_notfound++;}
}
print "\nNumber of sequences written to output = $count_found\n";
print "Number of sequences excluded = $count_notfound\n";
#*************************************************************************************************************************
#exit the program and close all files.
close INPUT;
close OUT;
exit;
#######################################################
#-------------------------------------Start ReadFasta---------------------------------------+
sub ReadFasta{ # to read fasta format files into hash. returns hash.
my $seqfile=shift(@_);
my ($header,@sequence);
chomp $seqfile;
open FASTA,"$seqfile";
print "reading Sequences from input file.....Plz wait...\n";
my%seq_hash=();
#$seq_hash{'RS_Concatenated'}="";
$/="\n>"; # Change record seperator to read Fasta
my$last_N=1;
while(<FASTA>){
chomp;
($header,@sequence)=split("\n",$_);
$header=~s/>//; # Remove Leading > from Header
$header=~s/\s*$//; # Remove trailing spaces from header
$header=~s/^\s*//; # Remove Leading spaces from Header
my$sequence= join("",@sequence);
$sequence=~ s/\s//g;
$sequence=~s/\n//g;
if($header=~/^\s*$/){next;}
# Make sure no duplicate header exists else they will be overwritten and lost during hashing due to same key.
if(!exists $seq_hash{$header}){
$seq_hash{$header}=$sequence; #feed headers and sequences in hash.
#$seq_hash{'RS_Concatenated'}=$seq_hash{'RS_Concatenated'}.$Ns.$sequence;
}
else {
# find a uniq header name by adding a number at the end. If header still exists, increase the number by one
while(exists $seq_hash{$header}){$header=$header.$last_N;$last_N++;}
$seq_hash{$header}=$sequence;
#$seq_hash{'RS_Concatenated'}=$seq_hash{'RS_Concatenated'}.$Ns.$sequence;
}
}
my @seq_count=keys (%seq_hash);
my $seq_count=@seq_count;
print "Done....\nNumber of sequences read form input file = $seq_count\n\n";
@seq_count=();
$/="\n"; # Record seperator set back to default newline.
return(%seq_hash);
}
#-------------------------------------End ReadFasta---------------------------------------+