-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_motifs_phylo.m
299 lines (245 loc) · 9.62 KB
/
find_motifs_phylo.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
function [ Z, S, mu, max_lr, min_ent, ...
min_ent_M, min_ent_s, ...
max_lr_M,max_lr_s, ...
posterior_mean_M, information,background ] = find_motifs_phylo(sequence_file,K, ...
n_iterations,burn_in, ...
a, mu_start, mu_unknown, beta)
% This code will run the Gibbs sampler motif detection algorithm of
% Lawrence et al. (1993) on a set of sequences inputted as a FASTA file.
%
% Joe Herman, Feb. 2013 ([email protected])
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% NOTES ON ARGUMENTS
%
%% sequence_file: a FASTA-formatted file containing the input sequences
%% K: the length of the motif
%% n_iterations: number of iterations for which Gibbs sampler
% should be run
%% burn_in: number of iterations to allow for the burn-in
% phase, while the MCMC is converging.
%% a: a constant multiplier for the uniform prior on the
% motif
%% mu_start: the starting value of mu
%% mu_unknown: 0 if mu is fixed to mu_start, 1 if mu is unknown and
% is to be estimated by MCMC.
%% beta: a length-two vector, containing prior parameters
% for mu (used only if mu_unknown == 1)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
alpha = ones(1,4) * a;
% The prior distribution for each column of the motif
[head, seqs] = fastaread1(sequence_file);
% read in a set of sequences from a FASTA formatted file
nseqs = length(seqs);
for (i = 1:nseqs)
seqs{i} = base2num(seqs{i});
% Turns a sequence of ACGT... into a numerical sequence of 1234...
end
background = compute_background(seqs);
% Compute the background distribution
z = ones(nseqs,1);
% Vector of zeros and ones to indicate which sequences contain the motif.
% This is to be used when we believe that some of the sequences do not
% contain the motif, otherwise all entries will stay as 1.
Z = ones(nseqs,n_iterations-1);
% This matrix (with the capitalised name) stores the value of
% z for each iteration of the simulation, for analysis purposes.
mu = mu_start;
% The probability of each sequence containing a motif
Mu = ones(n_iterations,1);
Mu(1) = mu;
% The value of mu from each iteration.
s = ones(nseqs,1);
% The start position of the motif in each sequence
S = ones(nseqs,n_iterations);
% A matrix that stores the s from each iteration.
%Random starting position
for (i = 1:nseqs) % For each sequence
L_i = length(seqs{i});
s(i) = randi([1,L_i-K+1],1);
S(:,i) = s(i);
end
min_ent = Inf; % Keep a record of the minimum PWM entropy
min_ent_M = zeros(4,K); % And the corresponding PWM
min_ent_s = ones(nseqs,1); % and starting positions
background_entropy = -sum(background .* log(background));
entropy = zeros(n_iterations,1);
% The above variables keep track of the entropy of the PWM, and store the
% PWM and starting positions for the minimum entropy PWM.
max_lr = 0; % The maximum observed likelihood ratio
max_lr_M = zeros(4,K);
max_lr_s = ones(nseqs,1);
lr = ones(n_iterations,1); % Record the likelihood ratio
% The above variables are used for the purposes of finding the PWM that
% maximises the likelihood ratio between the motif model and the random
% background model, as described in the accompanying explanatory document.
posterior_mean_M = zeros(4,K);
% The posterior mean PWM is updated once the Gibbs sampler has converged
% i.e. after the burn_in is over.
background_M = repmat(background,1,K);
% This is a background PWM derived from the inputted background
% frequencies.
M=sample_M(seqs,K,z,s,alpha); %initial M
for (iter = 1:n_iterations)
if (mod(iter,10)==0) % Every ten iterations
iter
% Print out where we're up to in the simulation
end
if (iter > 1)
Z(:,iter-1) = z;
% Store the previous value of the presence/absence vector
S(:,iter-1) = s;
% Store the previous value of the starting positins
if (mu_unknown)
mu(iter) = sample_mu(z,beta,nseqs);
else
mu(iter) = mu(iter-1);
end
end
for (i = 1:nseqs) % For each sequence
L_i = length(seqs{i});
M_new = sample_M(seqs,K,z,s,alpha);
% Compute the current PWM for the motif from all sequences.
% M will be a 4 x K array giving the probability of
% each base at each location along the motif.
% compute the probability ratio needed for Metropolis-Hastings part of algo (see nihms paper)
f = get_counts(seqs,K,z,s,alpha);
ratio = 1;
for kk=1:K
ratio=ratio*(felsenstein_likelihood(seqs,s,kk,M_new(:,kk)')/felsenstein_likelihood(seqs,s,kk,M(:,kk)'));
for nuc=1:4
ratio=ratio*(M(nuc,kk)^f(nuc,kk)/M_new(nuc,kk)^f(nuc,kk));
end
end
rr = rand(1);
if (rr < min(1,ratio))
M = M_new;
end
prob = zeros(L_i-K+1,1);
background_prob = zeros(L_i-K+1,1);
for (j = 1:(L_i-K+1)) % For each potential starting position
prob(j) = likelihood(seqs{i},j,M,K);
% likelihood of the motif starting here under the model
% defined by M
background_prob(j) = likelihood(seqs{i},j,background_M,K);
% likelihood of motif starting here under the random
% background model
end
% Now sample a new start position from the full conditional
[likelihood_ratio, s(i)] = sample_s(prob,background_prob,mu(iter),K);
% NB if a zero is sampled, it corresponds to the motif not being
% present in this particular sequence.
if (s(i) == 0)
% Then there is no motif in this sequence
z(i) = 0;
else
z(i) = 1;
% Form the product of the likelihood ratios for all of the
% sequences
lr(iter) = lr(iter) * likelihood_ratio;
end
end
if (iter > burn_in)
% i.e. if the chain has converged to the stationary distribution
posterior_mean_M = posterior_mean_M + M/(n_iterations-burn_in);
end
% Now update the min entropy and max likelihood ratio PWMs etc.
entropy(iter) = mean(-sum(M .* log(M))); % Mean entropy per site
if (iter > 1 && entropy(iter) < min_ent)
min_ent = entropy(iter);
min_ent_M = M;
min_ent_s = s;
end
if (iter > 1 && lr(iter) > max_lr)
max_lr = lr(iter);
max_lr_M = M;
max_lr_s = s;
end
% Store the last values:
Z(:,iter) = z;
S(:,iter) = s;
end
% Compute the average information per site (a vector of length
% n_iterations)
information = background_entropy - entropy;
end
function [ background ] = compute_background(seqs)
% This function returns a length-four column vector
% containing the background model for the sequences.
% The code defined below just uses a uniform distribution,
% but a better approach would be to compute the relative
% frequencies of each base in the sequences, and to use
% these as the background.
background = zeros(4,1);
for i=1:length(seqs)
seq = seqs{i};
for j=1:length(seq)
background(seq(j)) = background(seq(j)) + 1;
end
end
background = background / sum(background);
end
function f = get_counts(seqs,K,z,s,alpha)
f = zeros(4,K);
% for each sequence
for i=1:length(seqs)
seq = seqs{i};
% if I found the motif
if z(i)==1
% for each position in sequence
for k=1:K
j=k+s(i)-1;
% add one occurrence to the occurrence count
f(seq(j),k) = f(seq(j),k) + 1;
end
end
end
end
function [ M ] = sample_M(seqs,K,z,s,alpha)
f = get_counts(seqs,K,z,s,alpha);
M = zeros(4,K);
for k=1:K
M(:,k) = dirichrnd(alpha+f(:,k)');
end
end
function [ likelihood_ratio, s_i ] = sample_s(prob,background_prob,mu,K)
% 'prob' and 'background_prob' are vectors containing the
% probability of the motif starting at each possible site
% from 1 to L_i - K + 1 in sequence i, under the motif model M,
% and the background model G, respectively.
% This function returns a start position sampled according to
% its posterior probability, along with the corresponding
% likelihood ratio.
% If this function returns s_i = 0, it means that sequence i
% does not contain a copy of the motif.
L = length(prob);
likelihood_ratio = prob./background_prob;
prob_z=(L-K+1)*(1-mu)/((L-K+1)*(1-mu)+mu*sum(likelihood_ratio));
r = rand(1);
if (r < prob_z)
s_i=0;
likelihood_ratio = 1;
else
v = 1:length(prob);
s_i = randsample(v, likelihood_ratio);
likelihood_ratio = likelihood_ratio(s_i);
end
end
function [ mu ] = sample_mu(z,beta,nseqs)
% Samples mu from its full conditional, given the current values
% for z, and the prior parameters, beta.
% Matlab lists are 1-based so beta(1) = beta(0) on the instructions
a=beta(1)+sum(z);
b=beta(2)+nseqs-sum(z);
mu=betarnd(a,b);
end
function [ p ] = likelihood(sequence,s_i,M,K)
% Computes the likelihood of a subsequence of length K,
% beginning at s_i according to the model specified by M
p=1;
for (j = s_i:s_i+K-1) % from the starting point of the motif to its end
p = p*M(sequence(j),j-s_i+1);
% multiply p by the probability of the nucleotide 'sequence(j)' being at position
% 'j-s_i+1'. When j=s_i, j-s_i+1 = 1, which is good because
% the columns of the matrix start at 1
end
end