-
Notifications
You must be signed in to change notification settings - Fork 4
/
Random_sequence_selector.pl
96 lines (74 loc) · 2.89 KB
/
Random_sequence_selector.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
our($opt_s,$opt_l,$opt_n,$opt_r);
getopt('slnr');
my$usage="Usage: perl script options.....
-s sequence
-l length_of_fragment
-n Number of fragments to sample[100]
-r remove the Ns[False]\n";
print "\n$usage\n";
die "Cannot find Seqenuce file\n$usage" if !defined $opt_s;
$opt_l=500 if !defined $opt_l;
my $output=$opt_l."_nt_".$opt_n."_".$opt_s;
open OUT,">$output";
my%sequence=ReadFasta($opt_s);
foreach (keys%sequence){ # for every sequence
my$seq_length=length($sequence{$_});
my$count=0;
my$seq_name=$_;
for(my$i=0;$i<$opt_n;$i++){ # start choppping sequence from start
my$random_number=int(rand($seq_length-$opt_l-1)); # generate random start site
my$end_site=$random_number+$opt_l;
my$start_site=$random_number+1;
if(($seq_length-$random_number)>$opt_l){#for fragments larger than chopping size
my$dna_fragment=substr($sequence{$_},$random_number,$opt_l);
print OUT">$seq_name($start_site - $end_site)\n$dna_fragment\n";
}
else{#for fragments smaller than chopping size.
my$dna_fragment=substr($sequence{$_},$random_number);
print OUT">$seq_name($start_site - $seq_length)\n$dna_fragment\n";
print "Sequence smaller than requested length\n";
}
$count++;
}
print "$count fragments of $opt_l bases each from $_ .\n";
}
#-------------------------------------Start ReadFasta---------------------------------------+
sub ReadFasta{ # to read fasta format files into hash. returns hash.
my $seqfile=shift(@_);
my ($header,@sequence);
chomp $seqfile;
open FASTA,"$seqfile";
print "reading Sequences from input file.....Plz wait...\n";
my%seq_hash=();
$/="\n>"; # Change record seperator to read Fasta
my$sequence1=();
while(<FASTA>){
chomp;
($header,@sequence)=split("\n",$_);
$header=~s/>//; # Remove Leading > from Header
$header=~s/\s*$//; # Remove trailing spaces from header
$header=~s/^\s*//; # Remove Leading spaces from Header
if($header=~/^\s*$/){next;}
my$sequence= join("",@sequence);
$sequence=~ s/\s//g;
$sequence=~s/\n//g;
if(lc($opt_r) eq lc('true')){$sequence=~s/N//gi;} # remove Ns from the sequence
$sequence1.=$sequence; # join the sequences into one sequence
$header=();
@sequence=();
$sequence="";
}
$seq_hash{$seqfile}=$sequence1; #feed headers and sequences in hash.
my @seq_count=keys (%seq_hash);
my $seq_count=@seq_count;
my $seqlength=length($sequence1);
print "Done....\nNumber of sequences read form input file = $seq_count\n\n";
print "Done....\nTotal length of sequence read form input file = $seqlength\n\n";
@seq_count=();
$/="\n"; # Record seperator set back to default newline.
return(%seq_hash);
}
#-------------------------------------End ReadFasta---------------------------------------+